├── util ├── __init__.py ├── box_ops.py └── stanza_utils.py ├── models ├── albef │ ├── models │ │ └── __init__.py │ ├── config_bert.json │ └── VL_Transformer_ITM.py ├── vlp_model_builder.py ├── helpers.py └── vlp_model.py ├── mask2former ├── evaluation │ ├── __init__.py │ └── instance_evaluation.py ├── utils │ ├── __init__.py │ └── misc.py ├── modeling │ ├── backbone │ │ └── __init__.py │ ├── meta_arch │ │ ├── __init__.py │ │ └── mask_former_head.py │ ├── pixel_decoder │ │ ├── __init__.py │ │ └── ops │ │ │ ├── make.sh │ │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn.py │ │ │ ├── functions │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn_func.py │ │ │ ├── src │ │ │ ├── vision.cpp │ │ │ ├── cuda │ │ │ │ └── ms_deform_attn_cuda.h │ │ │ ├── cpu │ │ │ │ ├── ms_deform_attn_cpu.h │ │ │ │ └── ms_deform_attn_cpu.cpp │ │ │ └── ms_deform_attn.h │ │ │ ├── setup.py │ │ │ └── test.py │ ├── transformer_decoder │ │ ├── __init__.py │ │ ├── position_encoding.py │ │ └── maskformer_transformer_decoder.py │ └── __init__.py ├── __init__.py ├── test_time_augmentation.py └── config.py ├── image.png ├── configs ├── ade20k │ ├── semantic-segmentation │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ ├── swin │ │ │ ├── maskformer2_swin_tiny_bs16_160k.yaml │ │ │ ├── maskformer2_swin_small_bs16_160k.yaml │ │ │ ├── maskformer2_swin_base_384_bs16_160k_res640.yaml │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_160k_res640.yaml │ │ │ └── maskformer2_swin_base_IN21k_384_bs16_160k_res640.yaml │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ └── Base-ADE20K-SemanticSegmentation.yaml │ ├── instance-segmentation │ │ ├── swin │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_160k.yaml │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ └── Base-ADE20K-InstanceSegmentation.yaml │ └── panoptic-segmentation │ │ ├── swin │ │ └── maskformer2_swin_large_IN21k_384_bs16_160k.yaml │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ └── Base-ADE20K-PanopticSegmentation.yaml ├── coco │ ├── instance-segmentation │ │ ├── maskformer2_R101_bs16_50ep.yaml │ │ ├── swin │ │ │ ├── maskformer2_swin_small_bs16_50ep.yaml │ │ │ ├── maskformer2_swin_tiny_bs16_50ep.yaml │ │ │ ├── maskformer2_swin_base_384_bs16_50ep.yaml │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_50ep.yaml │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_100ep.yaml │ │ ├── Base-COCO-InstanceSegmentation.yaml │ │ └── maskformer2_R50_bs16_50ep.yaml │ └── panoptic-segmentation │ │ ├── maskformer2_R101_bs16_50ep.yaml │ │ ├── swin │ │ ├── maskformer2_swin_small_bs16_50ep.yaml │ │ ├── maskformer2_swin_tiny_bs16_50ep.yaml │ │ ├── maskformer2_swin_base_384_bs16_50ep.yaml │ │ ├── maskformer2_swin_base_IN21k_384_bs16_50ep.yaml │ │ └── maskformer2_swin_large_IN21k_384_bs16_100ep.yaml │ │ ├── Base-COCO-PanopticSegmentation.yaml │ │ └── maskformer2_R50_bs16_50ep.yaml ├── cityscapes │ ├── instance-segmentation │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ ├── swin │ │ │ ├── maskformer2_swin_tiny_bs16_90k.yaml │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ └── Base-Cityscapes-InstanceSegmentation.yaml │ ├── panoptic-segmentation │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ ├── swin │ │ │ ├── maskformer2_swin_tiny_bs16_90k.yaml │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ └── 
Base-Cityscapes-PanopticSegmentation.yaml │ └── semantic-segmentation │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ ├── swin │ │ ├── maskformer2_swin_tiny_bs16_90k.yaml │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ └── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ └── Base-Cityscapes-SemanticSegmentation.yaml ├── youtubevis_2019 │ ├── video_maskformer2_R101_bs16_8ep.yaml │ ├── swin │ │ ├── video_maskformer2_swin_small_bs16_8ep.yaml │ │ ├── video_maskformer2_swin_tiny_bs16_8ep.yaml │ │ ├── video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml │ │ └── video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml │ ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml │ └── video_maskformer2_R50_bs16_8ep.yaml ├── youtubevis_2021 │ ├── video_maskformer2_R101_bs16_8ep.yaml │ ├── swin │ │ ├── video_maskformer2_swin_small_bs16_8ep.yaml │ │ ├── video_maskformer2_swin_tiny_bs16_8ep.yaml │ │ ├── video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml │ │ └── video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml │ ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml │ └── video_maskformer2_R50_bs16_8ep.yaml ├── med_config.json ├── mapillary-vistas │ ├── panoptic-segmentation │ │ ├── swin │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_300k.yaml │ │ ├── maskformer_R50_bs16_300k.yaml │ │ └── Base-MapillaryVistas-PanopticSegmentation.yaml │ └── semantic-segmentation │ │ ├── swin │ │ └── maskformer2_swin_large_IN21k_384_bs16_300k.yaml │ │ ├── maskformer2_R50_bs16_300k.yaml │ │ └── Base-MapillaryVistas-SemanticSegmentation.yaml └── freesolo │ ├── freesolo_30k.yaml │ └── freemask.yaml ├── dataset ├── concat_dataset.py ├── __init__.py ├── refexp_eval.py ├── categories.py ├── image_to_seq_augmenter.py ├── a2d_eval.py ├── coco.py ├── samplers.py └── jhmdb.py ├── demo ├── Logger.py ├── postproc.py └── itergradcam.py ├── evaluator.py ├── .gitignore └── README.md /util/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/albef/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mask2former/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VoyageWang/IteRPrimE/HEAD/image.png -------------------------------------------------------------------------------- /mask2former/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask2former/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask2former/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask2former/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .maskformer_transformer_decoder import StandardTransformerDecoder 3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder 4 | -------------------------------------------------------------------------------- /mask2former/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .backbone.swin import D2SwinTransformer 3 | from .pixel_decoder.fpn import BasePixelDecoder 4 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder 5 | from .meta_arch.mask_former_head import MaskFormerHead 6 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead 7 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/maskformer2_R101_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/maskformer2_R101_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- 
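Note: the maskformer2_* YAML files above (and throughout configs/) are detectron2 configs chained together via the _BASE_ key. A minimal loading sketch, mirroring the upstream Mask2Former scripts (detectron2 is assumed to be installed; add_maskformer2_config is the hook exported by this repo's mask2former/__init__.py, and the deeplab hook follows the upstream training script and may not be strictly required here):

    from detectron2.config import get_cfg
    from detectron2.modeling import build_model
    from detectron2.projects.deeplab import add_deeplab_config
    from mask2former import add_maskformer2_config  # exported in mask2former/__init__.py below

    cfg = get_cfg()
    add_deeplab_config(cfg)      # upstream Mask2Former registers the DeepLab keys first
    add_maskformer2_config(cfg)  # registers the MASK_FORMER / extra SEM_SEG_HEAD keys used in these files
    cfg.merge_from_file("configs/cityscapes/instance-segmentation/maskformer2_R101_bs16_90k.yaml")
    # _BASE_ is resolved recursively: the R101 file above pulls in maskformer2_R50_bs16_90k.yaml and
    # Base-Cityscapes-InstanceSegmentation.yaml, then applies its own RESNETS overrides on top.
    model = build_model(cfg)     # instantiates the MaskFormer meta-architecture from the merged config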
/configs/cityscapes/panoptic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_eba159.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/youtubevis_2021/video_maskformer2_R101_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_eba159.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- 
/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/swin/maskformer2_swin_tiny_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/swin/maskformer2_swin_tiny_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 
2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/swin/maskformer2_swin_small_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/youtubevis_2019/swin/video_maskformer2_swin_small_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_1e7f22.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | 
-------------------------------------------------------------------------------- /configs/youtubevis_2019/swin/video_maskformer2_swin_tiny_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_86143f.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /configs/youtubevis_2021/swin/video_maskformer2_swin_small_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_1e7f22.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /configs/youtubevis_2021/swin/video_maskformer2_swin_tiny_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_86143f.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /models/vlp_model_builder.py: 
-------------------------------------------------------------------------------- 1 | from typing import Any 2 | from .vlp_model import VLPModel 3 | from models.albef.engine import ALBEF 4 | 5 | class VLPModelBuilder: 6 | MODEL_ID_MAP = { 7 | 'ALBEF': ALBEF, 8 | 'TCL': ALBEF, 9 | } 10 | def __call__(self, model_id, **kwargs) -> VLPModel: 11 | if model_id not in self.MODEL_ID_MAP.keys(): 12 | raise ValueError('Meaningless model_id: {}, you have to choose from: {}'.format(model_id, self.MODEL_ID_MAP.keys())) 13 | return self.MODEL_ID_MAP[model_id](model_id, **kwargs) -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /configs/youtubevis_2019/swin/video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_83d103.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TEST: 480 19 | -------------------------------------------------------------------------------- /configs/youtubevis_2021/swin/video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_83d103.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TEST: 480 19 | -------------------------------------------------------------------------------- /models/albef/config_bert.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertForMaskedLM" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "type_vocab_size": 2, 18 | "vocab_size": 30522, 19 | "fusion_layer": 6, 20 | "encoder_width": 768 21 | } -------------------------------------------------------------------------------- /configs/med_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "type_vocab_size": 2, 18 | "vocab_size": 30524, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } 22 | -------------------------------------------------------------------------------- 
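Note: models/albef/config_bert.json and configs/med_config.json above follow the HuggingFace BERT config schema (the med_config variant adds add_cross_attention for a BLIP-style multimodal encoder). A minimal sketch of how such a file is typically read, assuming the transformers package; the ALBEF/BLIP model code in this repo may wrap this differently:

    from transformers import BertConfig

    bert_config = BertConfig.from_json_file("models/albef/config_bert.json")
    # Standard fields (hidden_size, num_hidden_layers, ...) become regular BertConfig attributes;
    # ALBEF-specific extras such as fusion_layer and encoder_width are kept as plain attributes
    # on the config object for the custom fusion encoder to read.
    print(bert_config.num_hidden_layers, bert_config.fusion_layer)  # 12 6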
/configs/ade20k/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /configs/ade20k/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 
103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | -------------------------------------------------------------------------------- /configs/mapillary-vistas/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /configs/mapillary-vistas/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | -------------------------------------------------------------------------------- /configs/youtubevis_2019/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_e5f453.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | INPUT: 20 | MIN_SIZE_TEST: 480 21 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "../checkpoints/model_final_f07440.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | SOLVER: 20 | STEPS: (655556, 710184) 21 | MAX_ITER: 737500 22 | -------------------------------------------------------------------------------- /configs/youtubevis_2021/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | 
EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_e5f453.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | # OOM when using a larger test size 20 | # INPUT: 21 | # MIN_SIZE_TEST: 480 22 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | SOLVER: 20 | STEPS: (655556, 710184) 21 | MAX_ITER: 737500 22 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | # Copyright (c) Facebook, Inc. and its affiliates. 11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 12 | 13 | python setup.py build install 14 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /mask2former/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import data # register all new datasets 3 | from . import modeling 4 | 5 | # config 6 | from .config import add_maskformer2_config 7 | 8 | # dataset loading 9 | from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper 10 | from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper 11 | from .data.dataset_mappers.mask_former_instance_dataset_mapper import ( 12 | MaskFormerInstanceDatasetMapper, 13 | ) 14 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import ( 15 | MaskFormerPanopticDatasetMapper, 16 | ) 17 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 18 | MaskFormerSemanticDatasetMapper, 19 | ) 20 | 21 | # models 22 | from .maskformer_model import MaskFormer 23 | from .test_time_augmentation import SemanticSegmentorWithTTA 24 | 25 | # evaluation 26 | from .evaluation.instance_evaluation import InstanceSegEvaluator 27 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include "ms_deform_attn.h" 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /configs/freesolo/freesolo_30k.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "PseudoSOLOv2" 3 | PIXEL_MEAN: [123.675, 116.280, 103.530] 4 | PIXEL_STD: [58.395, 57.120, 57.375] 5 | WEIGHTS: "training_dir/pre-trained/DenseCL/densecl_r101_imagenet_200ep.pkl" 6 | MASK_ON: True 7 | BACKBONE: 8 | NAME: "build_resnet_fpn_backbone" 9 | FREEZE_AT: 0 10 | RESNETS: 11 | STRIDE_IN_1X1: False 12 | DEPTH: 101 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | FPN: 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | SOLOV2: 17 | NUM_CLASSES: 2 18 | LOSS: 19 | FOCAL_WEIGHT: 1.0 20 | DICE_WEIGHT: 1.0 21 | SOLVER: 22 | LR_SCHEDULER_NAME: "WarmupMultiStepLR" 23 | MAX_ITER: 30000 24 | STEPS: [] 25 | IMS_PER_BATCH: 16 26 | BASE_LR: 0.001 27 | WARMUP_FACTOR: 0.01 28 | WARMUP_ITERS: 1000 29 | CHECKPOINT_PERIOD: 5000 30 | DATASETS: 31 | TRAIN: ("coco_2017_train_unlabeled_densecl_r101",) 32 | TEST: ("coco_2017_val",) 33 | INPUT: 34 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 35 | MASK_FORMAT: "bitmask" 36 | FORMAT: "RGB" 37 | -------------------------------------------------------------------------------- /configs/freesolo/freemask.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "PseudoSOLOv2" 3 | PIXEL_MEAN: [123.675, 116.280, 103.530] 4 | PIXEL_STD: [58.395, 57.120, 57.375] 5 | WEIGHTS: "training_dir/pre-trained/DenseCL/densecl_r101_imagenet_200ep.pkl" 6 | MASK_ON: True 7 | BACKBONE: 8 | NAME: "build_resnet_backbone" 9 | FREEZE_AT: 0 10 | RESNETS: 11 | STRIDE_IN_1X1: False 12 | DEPTH: 101 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | FPN: 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | SOLOV2: 17 | IS_FREEMASK: True 18 | NUM_CLASSES: 2 19 | LOSS: 20 | FOCAL_WEIGHT: 1.0 21 | DICE_WEIGHT: 1.0 22 | SOLVER: 23 | LR_SCHEDULER_NAME: "WarmupMultiStepLR" 24 | MAX_ITER: 30000 25 | STEPS: [] 26 | IMS_PER_BATCH: 16 27 | BASE_LR: 0.001 28 | WARMUP_FACTOR: 0.01 29 | WARMUP_ITERS: 1000 30 | CHECKPOINT_PERIOD: 5000 31 | DATASETS: 32 | TRAIN: ("coco_2017_train_unlabeled_densecl_r101",) 33 | TEST: ("coco_2017_val",) 34 | INPUT: 35 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 36 | MASK_FORMAT: "bitmask" 37 | FORMAT: "RGB" 38 | -------------------------------------------------------------------------------- /dataset/concat_dataset.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # ------------------------------------------------------------------------ 5 | 6 | from pathlib import Path 7 | 8 | import torch 9 | import torch.utils.data 10 | 11 | from torch.utils.data import Dataset, ConcatDataset 12 | from .refexp2seq import build as build_seq_refexp 13 | from .ytvos import build as build_ytvs 14 | from dataset import ytvos 15 | 16 | 17 | 18 | def build(image_set, args): 19 | concat_data = [] 20 | 21 | print('preparing coco2seq dataset ....') 22 | coco_names = ["refcoco", "refcoco+", "refcocog"] 23 | for name in coco_names: 24 | coco_seq = build_seq_refexp(name, image_set, args) 25 | concat_data.append(coco_seq) 26 | 27 | print('preparing ytvos dataset .... ') 28 | ytvos_dataset = build_ytvs(image_set, args) 29 | concat_data.append(ytvos_dataset) 30 | 31 | concat_data = ConcatDataset(concat_data) 32 | 33 | return concat_data 34 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- 
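Note: the two res640 ADE20K Swin configs above and the one that follows build MIN_SIZE_TRAIN from a YAML eval tag rather than a literal list. Roughly, the tagged entry expands as below (plain Python; the exact values come from evaluating the quoted expression at config-load time):

    min_size_train = [int(x * 0.1 * 640) for x in range(5, 21)]
    # multiples of 64 from 320 up to 1280, i.e. 0.5x to 2.0x of the 640 crop size.
    # A plain yaml.safe_load rejects the !!python/object/apply:eval tag, so these files need a
    # loader that permits it (detectron2's config loader handles this when merging the file).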
/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "/home/vegetabot/Filesys/CodeField_win/Mask2Former/weights/model_final_7e47bf.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/Base-COCO-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train",) 18 | TEST: ("coco_2017_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_instance_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /demo/Logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import sys 4 | import errno 5 | def mkdir_if_missing(directory): 6 | if not osp.exists(directory): 7 | try: 8 | os.makedirs(directory) 9 | except OSError as e: 10 | if e.errno != errno.EEXIST: 11 | raise 12 | class Logger(object): 13 | """ 14 | Write console output to external text file. 15 | Code imported from https://github.com/Cysu/open-reid/blob/master/reid/utils/logging.py. 
16 | """ 17 | def __init__(self, fpath=None, mode = 'w'): 18 | self.console = sys.stdout 19 | self.file = None 20 | if fpath is not None: 21 | mkdir_if_missing(osp.dirname(fpath)) 22 | self.file = open(fpath, mode) 23 | 24 | def __del__(self): 25 | self.close() 26 | 27 | def __enter__(self): 28 | pass 29 | 30 | def __exit__(self, *args): 31 | self.close() 32 | 33 | def write(self, msg): 34 | self.console.write(msg) 35 | if self.file is not None: 36 | self.file.write(msg) 37 | 38 | def flush(self): 39 | self.console.flush() 40 | if self.file is not None: 41 | self.file.flush() 42 | os.fsync(self.file.fileno()) 43 | 44 | def close(self): 45 | self.console.close() 46 | if self.file is not None: 47 | self.file.close() -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/Base-COCO-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train_panoptic",) 18 | TEST: ("coco_2017_val_panoptic_with_sem_seg",) # to evaluate instance and semantic performance as well 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_panoptic_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /configs/youtubevis_2019/Base-YouTubeVIS-VideoInstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | STEM_TYPE: "basic" # not used 12 | STEM_OUT_CHANNELS: 64 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | # NORM: "SyncBN" 16 | RES5_MULTI_GRID: [1, 1, 1] # not used 17 | DATASETS: 18 | TRAIN: ("ytvis_2019_train",) 19 | TEST: ("ytvis_2019_val",) 20 | SOLVER: 21 | IMS_PER_BATCH: 16 22 | BASE_LR: 0.0001 23 | STEPS: (4000,) 24 | MAX_ITER: 6000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 10 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | BACKBONE_MULTIPLIER: 0.1 30 | CLIP_GRADIENTS: 31 | ENABLED: True 32 | CLIP_TYPE: "full_model" 33 | CLIP_VALUE: 0.01 34 | NORM_TYPE: 2.0 35 | AMP: 36 | ENABLED: True 37 | INPUT: 38 | MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" 39 | RANDOM_FLIP: "flip_by_clip" 40 | AUGMENTATIONS: [] 41 | MIN_SIZE_TRAIN: (360, 480) 42 | MIN_SIZE_TEST: 360 43 | 
CROP: 44 | ENABLED: False 45 | TYPE: "absolute_range" 46 | SIZE: (600, 720) 47 | FORMAT: "RGB" 48 | TEST: 49 | EVAL_PERIOD: 0 50 | DATALOADER: 51 | FILTER_EMPTY_ANNOTATIONS: False 52 | NUM_WORKERS: 4 53 | VERSION: 2 54 | -------------------------------------------------------------------------------- /configs/youtubevis_2021/Base-YouTubeVIS-VideoInstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | STEM_TYPE: "basic" # not used 12 | STEM_OUT_CHANNELS: 64 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | # NORM: "SyncBN" 16 | RES5_MULTI_GRID: [1, 1, 1] # not used 17 | DATASETS: 18 | TRAIN: ("ytvis_2021_train",) 19 | TEST: ("ytvis_2021_val",) 20 | SOLVER: 21 | IMS_PER_BATCH: 16 22 | BASE_LR: 0.0001 23 | STEPS: (5500,) 24 | MAX_ITER: 8000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 10 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | BACKBONE_MULTIPLIER: 0.1 30 | CLIP_GRADIENTS: 31 | ENABLED: True 32 | CLIP_TYPE: "full_model" 33 | CLIP_VALUE: 0.01 34 | NORM_TYPE: 2.0 35 | AMP: 36 | ENABLED: True 37 | INPUT: 38 | MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" 39 | RANDOM_FLIP: "flip_by_clip" 40 | AUGMENTATIONS: [] 41 | MIN_SIZE_TRAIN: (360, 480) 42 | MIN_SIZE_TEST: 360 43 | CROP: 44 | ENABLED: False 45 | TYPE: "absolute_range" 46 | SIZE: (600, 720) 47 | FORMAT: "RGB" 48 | TEST: 49 | EVAL_PERIOD: 0 50 | DATALOADER: 51 | FILTER_EMPTY_ANNOTATIONS: False 52 | NUM_WORKERS: 4 53 | VERSION: 2 54 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor ms_deform_attn_cuda_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step); 26 | 27 | std::vector ms_deform_attn_cuda_backward( 28 | const at::Tensor &value, 29 | const at::Tensor &spatial_shapes, 30 | const at::Tensor &level_start_index, 31 | const at::Tensor &sampling_loc, 32 | const at::Tensor &attn_weight, 33 | const at::Tensor &grad_output, 34 | const int im2col_step); 35 | 36 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 80 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: False 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor 20 | ms_deform_attn_cpu_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step); 27 | 28 | std::vector 29 | ms_deform_attn_cpu_backward( 30 | const at::Tensor &value, 31 | const at::Tensor &spatial_shapes, 32 | const at::Tensor &level_start_index, 33 | const at::Tensor &sampling_loc, 34 | const at::Tensor &attn_weight, 35 | const at::Tensor &grad_output, 36 | const int im2col_step); 37 | 38 | 39 | -------------------------------------------------------------------------------- /configs/ade20k/instance-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 100 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/ade20k/panoptic-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | 
TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 8 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: False 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: 
Base-Cityscapes-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/mapillary-vistas/panoptic-segmentation/maskformer_R50_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-MapillaryVistas-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 65 7 | NUM_CLASSES: 65 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | 
COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.0 45 | -------------------------------------------------------------------------------- /dataset/__init__.py: -------------------------------------------------------------------------------- 1 | import torch.utils.data 2 | import torchvision 3 | 4 | from .ytvos import build as build_ytvos 5 | from .davis import build as build_davis 6 | from .a2d import build as build_a2d 7 | from .jhmdb import build as build_jhmdb 8 | from .refexp import build as build_refexp 9 | from .concat_dataset import build as build_joint 10 | 11 | 12 | def get_coco_api_from_dataset(dataset): 13 | for _ in range(10): 14 | # if isinstance(dataset, torchvision.datasets.CocoDetection): 15 | # break 16 | if isinstance(dataset, torch.utils.data.Subset): 17 | dataset = dataset.dataset 18 | if isinstance(dataset, torchvision.datasets.CocoDetection): 19 | return dataset.coco 20 | 21 | 22 | def build_dataset(dataset_file: str, image_set: str, args): 23 | if dataset_file == 'ytvos': 24 | return build_ytvos(image_set, args) 25 | if dataset_file == 'davis': 26 | return build_davis(image_set, args) 27 | if dataset_file == 'a2d': 28 | return build_a2d(image_set, args) 29 | if dataset_file == 'jhmdb': 30 | return build_jhmdb(image_set, args) 31 | # for pretraining 32 | if dataset_file == "refcoco" or dataset_file == "refcoco+" or dataset_file == "refcocog": 33 | return build_refexp(dataset_file, image_set, args) 34 | # for joint training of refcoco and ytvos 35 | if dataset_file == 'joint': 36 | return build_joint(image_set, args) 37 | raise ValueError(f'dataset {dataset_file} not supported') 38 | -------------------------------------------------------------------------------- /configs/mapillary-vistas/semantic-segmentation/maskformer2_R50_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-MapillaryVistas-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 65 7 | NUM_CLASSES: 65 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | 
ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.0 45 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 133 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: True 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: True 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_3c8ec9.pkl" 4 | META_ARCHITECTURE: "VideoMaskFormer" 5 | SEM_SEG_HEAD: 6 | NAME: "MaskFormerHead" 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 40 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: False 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: False 44 | OVERLAP_THRESHOLD: 0.8 45 | 
OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /configs/youtubevis_2021/video_maskformer2_R50_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_3c8ec9.pkl" 4 | META_ARCHITECTURE: "VideoMaskFormer" 5 | SEM_SEG_HEAD: 6 | NAME: "MaskFormerHead" 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 40 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: False 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: False 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /models/helpers.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from typing import Any, Callable, Dict, Generator, Sequence 3 | 4 | 5 | def chunks(l: Sequence, n: int = 5) -> Generator[Sequence, None, None]: 6 | """Yield successive n-sized chunks from l.""" 7 | for i in range(0, len(l), n): 8 | yield l[i:i + n] 9 | 10 | class LRUCache: 11 | # initialising capacity 12 | def __init__(self, capacity: int): 13 | self.cache = OrderedDict() 14 | self.capacity = capacity 15 | 16 | def has(self, key) -> bool: 17 | return key in self.cache 18 | 19 | # we return the value of the key 20 | # that is queried in O(1) and return -1 if we 21 | # don't find the key in out dict / cache. 22 | # And also move the key to the end 23 | # to show that it was recently used. 24 | def get(self, key): 25 | if key not in self.cache: 26 | return None 27 | else: 28 | self.cache.move_to_end(key) 29 | return self.cache[key] 30 | 31 | # first, we add / update the key by conventional methods. 32 | # And also move the key to the end to show that it was recently used. 33 | # But here we will also check whether the length of our 34 | # ordered dictionary has exceeded our capacity, 35 | # If so we remove the first key (least recently used) 36 | def put(self, key, value) -> None: 37 | self.cache[key] = value 38 | self.cache.move_to_end(key) 39 | if len(self.cache) > self.capacity: 40 | self.cache.popitem(last=False) 41 | 42 | def pop(self, key, value): 43 | self.cache.pop(key, None) 44 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 
2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | 22 | at::Tensor 23 | ms_deform_attn_cpu_forward( 24 | const at::Tensor &value, 25 | const at::Tensor &spatial_shapes, 26 | const at::Tensor &level_start_index, 27 | const at::Tensor &sampling_loc, 28 | const at::Tensor &attn_weight, 29 | const int im2col_step) 30 | { 31 | AT_ERROR("Not implement on cpu"); 32 | } 33 | 34 | std::vector 35 | ms_deform_attn_cpu_backward( 36 | const at::Tensor &value, 37 | const at::Tensor &spatial_shapes, 38 | const at::Tensor &level_start_index, 39 | const at::Tensor &sampling_loc, 40 | const at::Tensor &attn_weight, 41 | const at::Tensor &grad_output, 42 | const int im2col_step) 43 | { 44 | AT_ERROR("Not implement on cpu"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /configs/mapillary-vistas/panoptic-segmentation/Base-MapillaryVistas-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_vistas_panoptic_train",) 18 | TEST: ("mapillary_vistas_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 300000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 2048 40 | MAX_SIZE_TRAIN: 8192 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (1024, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 1024 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 0 53 | DATALOADER: 54 | FILTER_EMPTY_ANNOTATIONS: True 55 | NUM_WORKERS: 10 56 | VERSION: 2 57 | -------------------------------------------------------------------------------- /configs/mapillary-vistas/semantic-segmentation/Base-MapillaryVistas-SemanticSegmentation.yaml: 
-------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_vistas_sem_seg_train",) 18 | TEST: ("mapillary_vistas_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 300000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 2048 40 | MAX_SIZE_TRAIN: 8192 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (1024, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 1024 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 0 53 | DATALOADER: 54 | FILTER_EMPTY_ANNOTATIONS: True 55 | NUM_WORKERS: 10 56 | VERSION: 2 57 | -------------------------------------------------------------------------------- /configs/ade20k/instance-segmentation/Base-ADE20K-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_instance_train",) 18 | TEST: ("ade20k_instance_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 640 40 | MAX_SIZE_TRAIN: 2560 41 | MAX_SIZE_TEST: 2560 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (640, 640) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_instance" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 56 | MAX_SIZE: 4480 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | 
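A detail in the base configs above that is easy to misread: entries such as `MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]` use PyYAML's python tags, which the detectron2-style config loading these files are written for accepts; the tag simply evaluates the quoted list comprehension, so the setting is nothing more than a multi-scale list of training short sides. A plain-Python sketch of what the ADE20K instance/panoptic value expands to (the other base configs do the same with their own test size: 512, 1024 or 2048):

```python
# Equivalent of the !!python/object/apply:eval tag in the ADE20K base configs:
# training short sides from 0.5x to 2.0x of the 640 test size, in 0.1 steps.
min_size_train = [int(x * 0.1 * 640) for x in range(5, 21)]
print(min_size_train)  # [320, 384, 448, ..., 1216, 1280] -- 16 scales, 64 px apart
```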
-------------------------------------------------------------------------------- /configs/ade20k/panoptic-segmentation/Base-ADE20K-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_panoptic_train",) 18 | TEST: ("ade20k_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 640 40 | MAX_SIZE_TRAIN: 2560 41 | MAX_SIZE_TEST: 2560 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (640, 640) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 56 | MAX_SIZE: 4480 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/Base-ADE20K-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_sem_seg_train",) 18 | TEST: ("ade20k_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 512 40 | MAX_SIZE_TRAIN: 2048 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 512) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 512 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | 
EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [256, 384, 512, 640, 768, 896] 56 | MAX_SIZE: 3584 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/Base-Cityscapes-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_panoptic_train",) 18 | TEST: ("cityscapes_fine_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/Base-Cityscapes-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_sem_seg_train",) 18 | TEST: ("cityscapes_fine_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 
| MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/Base-Cityscapes-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_instance_seg_train",) 18 | TEST: ("cityscapes_fine_instance_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_instance" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /models/vlp_model.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | # from Detic.predict import get_bbox_by_detic_rec, get_bbox_by_detic_rec_general 4 | from .helpers import LRUCache 5 | # from utils.cal_utils import cal_iou 6 | from util.stanza_utils import find_main_words, find_agent_by_stanza, find_no_main_words 7 | # from utils.map_to_coco_label import map_to_coco_label 8 | 9 | class VLPModel: 10 | MAX_CACHE = 20 11 | def __init__(self, model_id, device='cuda', templates = 'there is a {}', 12 | checkpoint_dir = '../checkpoints'): 13 | self._models = LRUCache(self.MAX_CACHE) 14 | self.model_id = model_id 15 | self.device = device 16 | self.templates = templates 17 | self.checkpoint_dir = checkpoint_dir 18 | 19 | def get_bbox_for_rec(self, image_path, category, threshold=0.15, unk=False, general=False): 20 | if general: 21 | return get_bbox_by_detic_rec_general(image_path, category, threshold) 22 | else: 23 | return 
get_bbox_by_detic_rec(image_path, category, threshold, unk=unk) 24 | 25 | def cal_iou(self, box1, box2): 26 | return cal_iou(box1, box2) 27 | 28 | def find_main_words(self, sent, start_idx, tokenizer): 29 | return find_main_words(sent, start_idx, tokenizer) 30 | 31 | def find_no_main_words(self, sent, start_idx, tokenizer): 32 | return find_no_main_words(sent, start_idx, tokenizer) 33 | 34 | def find_agent(self, sent): 35 | return find_agent_by_stanza(sent) 36 | 37 | def map_to_coco_label(self, agent): 38 | return map_to_coco_label([agent])[0] 39 | 40 | def get_results_for_rec( 41 | self, 42 | image_path: str, 43 | texts: List[str], 44 | gt_bbox: List[int], 45 | block_num: int = 8, 46 | category='', 47 | mapped_categories=None 48 | ): 49 | pass 50 | 51 | def cal_score( 52 | self, 53 | gradcam, 54 | gt_bbox, 55 | boxes_category, 56 | od_scores 57 | ): 58 | pass 59 | 60 | 61 | -------------------------------------------------------------------------------- /evaluator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | class Evaluator(): 5 | def __init__(self): 6 | self.counters_by_iou = {iou: 0 for iou in [0.5, 0.6, 0.7, 0.8, 0.9]} 7 | self.total_intersection_area = 0 8 | self.total_union_area = 0 9 | self.ious_list = [] 10 | pass 11 | def compute_mask_iou(self, outputs: torch.Tensor, labels: torch.Tensor, EPS=1e-6): 12 | assert outputs.shape[0] == 1; assert outputs.shape == labels.shape; assert len(outputs.shape) == 3 13 | outputs = outputs.int(); labels = labels.int() 14 | intersection = (outputs & labels).float().sum((1, 2)) # Will be zero if Truth=0 or Prediction=0 15 | union = (outputs | labels).float().sum((1, 2)) # Will be zero if both are 0 16 | iou = (intersection + EPS) / (union + EPS) # EPS is used to avoid division by zero 17 | 18 | iou, intersection, union = iou.item(), intersection.item(), union.item() 19 | 20 | for iou_threshold in self.counters_by_iou.keys(): 21 | if iou > iou_threshold: 22 | self.counters_by_iou[iou_threshold] += 1 23 | 24 | self.total_intersection_area += intersection 25 | self.total_union_area += union 26 | self.ious_list.append(iou) 27 | 28 | return iou, intersection, union 29 | 30 | def evaluate(self): 31 | num_samples = len(self.ious_list) 32 | 33 | if num_samples == 0: 34 | print("No samples to evaluate.") 35 | return 36 | 37 | precision_at_k = np.array(list(self.counters_by_iou.values())) / num_samples 38 | overall_iou = self.total_intersection_area / self.total_union_area 39 | mean_iou = np.mean(self.ious_list) 40 | 41 | print("Evaluation Result") 42 | iou_thresholds = [0.5, 0.6, 0.7, 0.8, 0.9] 43 | for iou, prec in zip(iou_thresholds, precision_at_k): 44 | print(f"IoU: {iou:.2f}, Precision: {prec:.4f}") 45 | print("========================================") 46 | print(f"Overall IoU: {overall_iou:.4f}") 47 | print(f"Mean IoU: {mean_iou:.4f}") -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | 18 | #include "cpu/ms_deform_attn_cpu.h" 19 | 20 | #ifdef WITH_CUDA 21 | #include "cuda/ms_deform_attn_cuda.h" 22 | #endif 23 | 24 | 25 | at::Tensor 26 | ms_deform_attn_forward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const int im2col_step) 33 | { 34 | if (value.type().is_cuda()) 35 | { 36 | #ifdef WITH_CUDA 37 | return ms_deform_attn_cuda_forward( 38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector 47 | ms_deform_attn_backward( 48 | const at::Tensor &value, 49 | const at::Tensor &spatial_shapes, 50 | const at::Tensor &level_start_index, 51 | const at::Tensor &sampling_loc, 52 | const at::Tensor &attn_weight, 53 | const at::Tensor &grad_output, 54 | const int im2col_step) 55 | { 56 | if (value.type().is_cuda()) 57 | { 58 | #ifdef WITH_CUDA 59 | return ms_deform_attn_cuda_backward( 60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 61 | #else 62 | AT_ERROR("Not compiled with GPU support"); 63 | #endif 64 | } 65 | AT_ERROR("Not implemented on the CPU"); 66 | } 67 | 68 | -------------------------------------------------------------------------------- /mask2former/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 
16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | if mask is None: 31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 32 | not_mask = ~mask 33 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 34 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 35 | if self.normalize: 36 | eps = 1e-6 37 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 38 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 39 | 40 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 41 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 42 | 43 | pos_x = x_embed[:, :, :, None] / dim_t 44 | pos_y = y_embed[:, :, :, None] / dim_t 45 | pos_x = torch.stack( 46 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 47 | ).flatten(3) 48 | pos_y = torch.stack( 49 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 50 | ).flatten(3) 51 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 52 | return pos 53 | 54 | def __repr__(self, _repr_indent=4): 55 | head = "Positional encoding " + self.__class__.__name__ 56 | body = [ 57 | "num_pos_feats: {}".format(self.num_pos_feats), 58 | "temperature: {}".format(self.temperature), 59 | "normalize: {}".format(self.normalize), 60 | "scale: {}".format(self.scale), 61 | ] 62 | # _repr_indent = 4 63 | lines = [head] + [" " * _repr_indent + line for line in body] 64 | return "\n".join(lines) 65 | -------------------------------------------------------------------------------- /demo/postproc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import scipy.ndimage 3 | 4 | def mySig(x, a = 20, b = 0.5): 5 | sig_x = 1 / (1 + torch.exp(a*(-x+b))) 6 | sig_x = sig_x / sig_x.max() 7 | return x 8 | 9 | def post_process(bin_mask, gradcam, caption, select_idx, alpha = 1.00): 10 | """ 11 | 后处理操作 12 | 13 | Args: 14 | bin_mask (torch.Tensor): 候选mask tensor, shape=(1, 100, H, W) 15 | gradcam (torch.Tensor): GradCAM注意力图tensor, shape=(H, W) 16 | 17 | Returns: 18 | torch.Tensor: 最终mask, shape=(1, 1, H, W) 19 | """ 20 | B, N, H, W = bin_mask.size() 21 | 22 | device = bin_mask.device 23 | max_val = torch.max(gradcam) 24 | max_coord = torch.nonzero(gradcam >= max_val * alpha) 25 | # 先去做一个判断初筛,看最大的这个点的位置是否是白色像素,这样就能够有效的取出 26 | bin_mask_at_seed = torch.zeros((B,N), dtype=torch.bool).to(device) 27 | for coord in max_coord: 28 | max_x, max_y = coord[0].item(), coord[1].item() 29 | bin_mask_at_seed_tmp = bin_mask[:, :, max_x, max_y] 30 | bin_mask_at_seed = torch.logical_or(bin_mask_at_seed, bin_mask_at_seed_tmp != 0.0) 31 | 32 | # 检查每个候选mask是否在seed位置有白色像素 33 | 34 | # 初筛2 35 | bin_mask_tmp = bin_mask.squeeze(0) 36 | res = [] 37 | for mask in bin_mask_tmp: 38 | mask_np = mask.cpu().numpy() 39 | labeled_array, num_features = scipy.ndimage.label(mask_np) 40 | res.append(num_features<=12) 41 | res = torch.tensor(res).float().unsqueeze(0).to(device) 42 | bin_mask_at_seed = torch.logical_and(bin_mask_at_seed, res) 43 | 44 | bin_mask_at_seed = bin_mask_at_seed.float() 45 | # 对于初筛1,如果没有,将对应的mask置零, 然后后面进行初筛2 46 | 
bin_mask_tmp = bin_mask * bin_mask_at_seed.unsqueeze(-1).unsqueeze(-1) 47 | 48 | 49 | # gradcam = gradcam_reweight(caption, gradcam) 50 | 51 | # gradcam = torch.exp(gradcam) 52 | # 1. 计算每个候选mask和gradcam的点乘分数 53 | # nw = 1; gradcam = torch.exp(gradcam * nw) / torch.exp(torch.tensor(nw)) # 对gradcam重新加权一下 54 | # thresh = 0.5; 55 | # gradcam = (gradcam >= thresh).float() 56 | # gradcam = mySig(gradcam) 57 | scores = (bin_mask_tmp + bin_mask_tmp*gradcam.unsqueeze(0).unsqueeze(0)).sum((2, 3)) # (1, 100) 58 | 59 | w_area = bin_mask_tmp.sum((2,3)) + 1e-5 60 | 61 | scores /= w_area 62 | 63 | # 2. 选择分数最高的候选mask 64 | sorted_tensor, indices = torch.sort(scores,descending=True) 65 | for indice in indices[0]: 66 | if indice.item() in select_idx: continue 67 | max_idx = indice.item() 68 | break 69 | # max_score, max_idx = scores.view(200).max(0) 70 | print(max_idx) 71 | candidate_mask = bin_mask_tmp[0, max_idx] 72 | 73 | # 3. 找到分数最高点作为种子点, 进行区域提取 74 | 75 | 76 | # seed_point = (max_x, max_y) 77 | # final_mask = extract_region(candidate_mask, seed_point).unsqueeze(0).to(device) 78 | 79 | return candidate_mask, max_idx 80 | 81 | if __name__ == "__main__": 82 | bin_mask = torch.randn(1, 200, 480, 640) 83 | gradcam = torch.randn(480, 640) 84 | select_idx = [] 85 | caption = '' 86 | 87 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | import os 13 | import glob 14 | 15 | import torch 16 | 17 | from torch.utils.cpp_extension import CUDA_HOME 18 | from torch.utils.cpp_extension import CppExtension 19 | from torch.utils.cpp_extension import CUDAExtension 20 | 21 | from setuptools import find_packages 22 | from setuptools import setup 23 | 24 | requirements = ["torch", "torchvision"] 25 | 26 | def get_extensions(): 27 | this_dir = os.path.dirname(os.path.abspath(__file__)) 28 | extensions_dir = os.path.join(this_dir, "src") 29 | 30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 33 | 34 | sources = main_file + source_cpu 35 | extension = CppExtension 36 | extra_compile_args = {"cxx": []} 37 | define_macros = [] 38 | 39 | # Force cuda since torch ask for a device, not if cuda is in fact available. 
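# (Descriptive comment added for clarity; not part of the original setup.py.) The branch
# below picks the build type: it switches to CUDAExtension, adds the src/cuda/*.cu sources
# and defines WITH_CUDA whenever FORCE_CUDA is set or torch.cuda.is_available() is True,
# provided CUDA_HOME points at a CUDA toolkit; otherwise it raises NotImplementedError
# naming whichever prerequisite (CUDA_HOME or a visible CUDA runtime) is missing, since
# the deformable-attention op has no CPU implementation in this repository.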
40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: 41 | extension = CUDAExtension 42 | sources += source_cuda 43 | define_macros += [("WITH_CUDA", None)] 44 | extra_compile_args["nvcc"] = [ 45 | "-DCUDA_HAS_FP16=1", 46 | "-D__CUDA_NO_HALF_OPERATORS__", 47 | "-D__CUDA_NO_HALF_CONVERSIONS__", 48 | "-D__CUDA_NO_HALF2_OPERATORS__", 49 | ] 50 | else: 51 | if CUDA_HOME is None: 52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') 53 | else: 54 | raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') 55 | 56 | sources = [os.path.join(extensions_dir, s) for s in sources] 57 | include_dirs = [extensions_dir] 58 | ext_modules = [ 59 | extension( 60 | "MultiScaleDeformableAttention", 61 | sources, 62 | include_dirs=include_dirs, 63 | define_macros=define_macros, 64 | extra_compile_args=extra_compile_args, 65 | ) 66 | ] 67 | return ext_modules 68 | 69 | setup( 70 | name="MultiScaleDeformableAttention", 71 | version="1.0", 72 | author="Weijie Su", 73 | url="https://github.com/fundamentalvision/Deformable-DETR", 74 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 75 | packages=find_packages(exclude=("configs", "tests",)), 76 | ext_modules=get_extensions(), 77 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 78 | ) 79 | -------------------------------------------------------------------------------- /dataset/refexp_eval.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved 2 | import copy 3 | from collections import defaultdict 4 | from pathlib import Path 5 | 6 | import torch 7 | import torch.utils.data 8 | 9 | import util.misc as utils 10 | from util.box_ops import generalized_box_iou 11 | 12 | 13 | class RefExpEvaluator(object): 14 | def __init__(self, refexp_gt, iou_types, k=(1, 5, 10), thresh_iou=0.5): 15 | assert isinstance(k, (list, tuple)) 16 | refexp_gt = copy.deepcopy(refexp_gt) 17 | self.refexp_gt = refexp_gt 18 | self.iou_types = iou_types 19 | self.img_ids = self.refexp_gt.imgs.keys() 20 | self.predictions = {} 21 | self.k = k 22 | self.thresh_iou = thresh_iou 23 | 24 | def accumulate(self): 25 | pass 26 | 27 | def update(self, predictions): 28 | self.predictions.update(predictions) 29 | 30 | def synchronize_between_processes(self): 31 | all_predictions = utils.all_gather(self.predictions) 32 | merged_predictions = {} 33 | for p in all_predictions: 34 | merged_predictions.update(p) 35 | self.predictions = merged_predictions 36 | 37 | def summarize(self): 38 | if utils.is_main_process(): 39 | dataset2score = { 40 | "refcoco": {k: 0.0 for k in self.k}, 41 | "refcoco+": {k: 0.0 for k in self.k}, 42 | "refcocog": {k: 0.0 for k in self.k}, 43 | } 44 | dataset2count = {"refcoco": 0.0, "refcoco+": 0.0, "refcocog": 0.0} 45 | for image_id in self.img_ids: 46 | ann_ids = self.refexp_gt.getAnnIds(imgIds=image_id) 47 | assert len(ann_ids) == 1 48 | img_info = self.refexp_gt.loadImgs(image_id)[0] 49 | 50 | target = self.refexp_gt.loadAnns(ann_ids[0]) 51 | prediction = self.predictions[image_id] 52 | assert prediction is not None 53 | sorted_scores_boxes = sorted( 54 | zip(prediction["scores"].tolist(), prediction["boxes"].tolist()), reverse=True 55 | ) 56 | sorted_scores, sorted_boxes = zip(*sorted_scores_boxes) 57 | sorted_boxes = 
torch.cat([torch.as_tensor(x).view(1, 4) for x in sorted_boxes]) 58 | target_bbox = target[0]["bbox"] 59 | converted_bbox = [ 60 | target_bbox[0], 61 | target_bbox[1], 62 | target_bbox[2] + target_bbox[0], 63 | target_bbox[3] + target_bbox[1], 64 | ] 65 | giou = generalized_box_iou(sorted_boxes, torch.as_tensor(converted_bbox).view(-1, 4)) 66 | for k in self.k: 67 | if max(giou[:k]) >= self.thresh_iou: 68 | dataset2score[img_info["dataset_name"]][k] += 1.0 69 | dataset2count[img_info["dataset_name"]] += 1.0 70 | 71 | for key, value in dataset2score.items(): 72 | for k in self.k: 73 | try: 74 | value[k] /= dataset2count[key] 75 | except: 76 | pass 77 | results = {} 78 | for key, value in dataset2score.items(): 79 | results[key] = sorted([v for k, v in value.items()]) 80 | print(f" Dataset: {key} - Precision @ 1, 5, 10: {results[key]} \n") 81 | 82 | return results 83 | return None 84 | 85 | 86 | -------------------------------------------------------------------------------- /util/box_ops.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for bounding box manipulation and GIoU. 3 | """ 4 | import torch 5 | from torchvision.ops.boxes import box_area 6 | 7 | def clip_iou(boxes1,boxes2): 8 | area1 = box_area(boxes1) 9 | area2 = box_area(boxes2) 10 | lt = torch.max(boxes1[:, :2], boxes2[:, :2]) 11 | rb = torch.min(boxes1[:, 2:], boxes2[:, 2:]) 12 | wh = (rb - lt).clamp(min=0) 13 | inter = wh[:,0] * wh[:,1] 14 | union = area1 + area2 - inter 15 | iou = (inter + 1e-6) / (union+1e-6) 16 | return iou 17 | 18 | def multi_iou(boxes1, boxes2): 19 | lt = torch.max(boxes1[...,:2], boxes2[...,:2]) 20 | rb = torch.min(boxes1[...,2:], boxes2[...,2:]) 21 | wh = (rb - lt).clamp(min=0) 22 | wh_1 = boxes1[...,2:] - boxes1[...,:2] 23 | wh_2 = boxes2[...,2:] - boxes2[...,:2] 24 | inter = wh[...,0] * wh[...,1] 25 | union = wh_1[...,0] * wh_1[...,1] + wh_2[...,0] * wh_2[...,1] - inter 26 | iou = (inter + 1e-6) / (union + 1e-6) 27 | return iou 28 | 29 | def box_cxcywh_to_xyxy(x): 30 | x_c, y_c, w, h = x.unbind(-1) 31 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 32 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 33 | return torch.stack(b, dim=-1) 34 | 35 | 36 | def box_xyxy_to_cxcywh(x): 37 | x0, y0, x1, y1 = x.unbind(-1) 38 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 39 | (x1 - x0), (y1 - y0)] 40 | return torch.stack(b, dim=-1) 41 | 42 | 43 | # modified from torchvision to also return the union 44 | def box_iou(boxes1, boxes2): 45 | area1 = box_area(boxes1) 46 | area2 = box_area(boxes2) 47 | 48 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 49 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 50 | 51 | wh = (rb - lt).clamp(min=0) # [N,M,2] 52 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 53 | 54 | union = area1[:, None] + area2 - inter 55 | 56 | iou = (inter+1e-6) / (union+1e-6) 57 | return iou, union 58 | 59 | 60 | def generalized_box_iou(boxes1, boxes2): 61 | """ 62 | Generalized IoU from https://giou.stanford.edu/ 63 | 64 | The boxes should be in [x0, y0, x1, y1] format 65 | 66 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 67 | and M = len(boxes2) 68 | """ 69 | # degenerate boxes gives inf / nan results 70 | # so do an early check 71 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 72 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 73 | iou, union = box_iou(boxes1, boxes2) 74 | 75 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 76 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 77 | 78 | wh = (rb - lt).clamp(min=0) # [N,M,2] 79 | 
area = wh[:, :, 0] * wh[:, :, 1] 80 | 81 | return iou - ((area - union) + 1e-6) / (area + 1e-6) 82 | 83 | 84 | def masks_to_boxes(masks): 85 | """Compute the bounding boxes around the provided masks 86 | 87 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 88 | 89 | Returns a [N, 4] tensors, with the boxes in xyxy format 90 | """ 91 | if masks.numel() == 0: 92 | return torch.zeros((0, 4), device=masks.device) 93 | 94 | h, w = masks.shape[-2:] 95 | 96 | y = torch.arange(0, h, dtype=torch.float) 97 | x = torch.arange(0, w, dtype=torch.float) 98 | y, x = torch.meshgrid(y, x) 99 | 100 | x_mask = (masks * x.unsqueeze(0)) 101 | x_max = x_mask.flatten(1).max(-1)[0] 102 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 103 | 104 | y_mask = (masks * y.unsqueeze(0)) 105 | y_max = y_mask.flatten(1).max(-1)[0] 106 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 107 | 108 | return torch.stack([x_min, y_min, x_max, y_max], 1) 109 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | try: 22 | import MultiScaleDeformableAttention as MSDA 23 | except ModuleNotFoundError as e: 24 | info_string = ( 25 | "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" 26 | "\t`cd mask2former/modeling/pixel_decoder/ops`\n" 27 | "\t`sh make.sh`\n" 28 | ) 29 | raise ModuleNotFoundError(info_string) 30 | 31 | 32 | class MSDeformAttnFunction(Function): 33 | @staticmethod 34 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 35 | ctx.im2col_step = im2col_step 36 | output = MSDA.ms_deform_attn_forward( 37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 38 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 39 | return output 40 | 41 | @staticmethod 42 | @once_differentiable 43 | def backward(ctx, grad_output): 44 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 45 | grad_value, grad_sampling_loc, grad_attn_weight = \ 46 | MSDA.ms_deform_attn_backward( 47 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 48 | 49 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 50 | 51 | 52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 53 | # for debug and test only, 54 | # need to use cuda version instead 55 | N_, S_, M_, D_ = value.shape 56 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 57 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 58 | sampling_grids = 2 * sampling_locations - 1 59 | sampling_value_list = [] 60 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 61 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 62 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 63 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 64 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 65 | # N_*M_, D_, Lq_, P_ 66 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 67 | mode='bilinear', padding_mode='zeros', align_corners=False) 68 | sampling_value_list.append(sampling_value_l_) 69 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 70 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 71 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 72 | return output.transpose(1, 2).contiguous() 73 | -------------------------------------------------------------------------------- /dataset/categories.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------------------------- 2 | # 1. 
Ref-Youtube-VOS 3 | ytvos_category_dict = { 4 | 'airplane': 0, 'ape': 1, 'bear': 2, 'bike': 3, 'bird': 4, 'boat': 5, 'bucket': 6, 'bus': 7, 'camel': 8, 'cat': 9, 5 | 'cow': 10, 'crocodile': 11, 'deer': 12, 'dog': 13, 'dolphin': 14, 'duck': 15, 'eagle': 16, 'earless_seal': 17, 6 | 'elephant': 18, 'fish': 19, 'fox': 20, 'frisbee': 21, 'frog': 22, 'giant_panda': 23, 'giraffe': 24, 'hand': 25, 7 | 'hat': 26, 'hedgehog': 27, 'horse': 28, 'knife': 29, 'leopard': 30, 'lion': 31, 'lizard': 32, 'monkey': 33, 8 | 'motorbike': 34, 'mouse': 35, 'others': 36, 'owl': 37, 'paddle': 38, 'parachute': 39, 'parrot': 40, 'penguin': 41, 9 | 'person': 42, 'plant': 43, 'rabbit': 44, 'raccoon': 45, 'sedan': 46, 'shark': 47, 'sheep': 48, 'sign': 49, 10 | 'skateboard': 50, 'snail': 51, 'snake': 52, 'snowboard': 53, 'squirrel': 54, 'surfboard': 55, 'tennis_racket': 56, 11 | 'tiger': 57, 'toilet': 58, 'train': 59, 'truck': 60, 'turtle': 61, 'umbrella': 62, 'whale': 63, 'zebra': 64 12 | } 13 | 14 | ytvos_category_list = [ 15 | 'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bucket', 'bus', 'camel', 'cat', 'cow', 'crocodile', 16 | 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frisbee', 'frog', 17 | 'giant_panda', 'giraffe', 'hand', 'hat', 'hedgehog', 'horse', 'knife', 'leopard', 'lion', 'lizard', 18 | 'monkey', 'motorbike', 'mouse', 'others', 'owl', 'paddle', 'parachute', 'parrot', 'penguin', 'person', 19 | 'plant', 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'sign', 'skateboard', 'snail', 'snake', 'snowboard', 20 | 'squirrel', 'surfboard', 'tennis_racket', 'tiger', 'toilet', 'train', 'truck', 'turtle', 'umbrella', 'whale', 'zebra' 21 | ] 22 | 23 | # ------------------------------------------------------------------------------------------------------------------- 24 | # 2. 
Ref-DAVIS17 25 | davis_category_dict = { 26 | 'airplane': 0, 'backpack': 1, 'ball': 2, 'bear': 3, 'bicycle': 4, 'bird': 5, 'boat': 6, 'bottle': 7, 'box': 8, 'bus': 9, 27 | 'camel': 10, 'car': 11, 'carriage': 12, 'cat': 13, 'cellphone': 14, 'chamaleon': 15, 'cow': 16, 'deer': 17, 'dog': 18, 28 | 'dolphin': 19, 'drone': 20, 'elephant': 21, 'excavator': 22, 'fish': 23, 'goat': 24, 'golf cart': 25, 'golf club': 26, 29 | 'grass': 27, 'guitar': 28, 'gun': 29, 'helicopter': 30, 'horse': 31, 'hoverboard': 32, 'kart': 33, 'key': 34, 'kite': 35, 30 | 'koala': 36, 'leash': 37, 'lion': 38, 'lock': 39, 'mask': 40, 'microphone': 41, 'monkey': 42, 'motorcycle': 43, 'oar': 44, 31 | 'paper': 45, 'paraglide': 46, 'person': 47, 'pig': 48, 'pole': 49, 'potted plant': 50, 'puck': 51, 'rack': 52, 'rhino': 53, 32 | 'rope': 54, 'sail': 55, 'scale': 56, 'scooter': 57, 'selfie stick': 58, 'sheep': 59, 'skateboard': 60, 'ski': 61, 'ski poles': 62, 33 | 'snake': 63, 'snowboard': 64, 'stick': 65, 'stroller': 66, 'surfboard': 67, 'swing': 68, 'tennis racket': 69, 'tractor': 70, 34 | 'trailer': 71, 'train': 72, 'truck': 73, 'turtle': 74, 'varanus': 75, 'violin': 76, 'wheelchair': 77 35 | } 36 | 37 | davis_category_list = [ 38 | 'airplane', 'backpack', 'ball', 'bear', 'bicycle', 'bird', 'boat', 'bottle', 'box', 'bus', 'camel', 'car', 'carriage', 39 | 'cat', 'cellphone', 'chamaleon', 'cow', 'deer', 'dog', 'dolphin', 'drone', 'elephant', 'excavator', 'fish', 'goat', 40 | 'golf cart', 'golf club', 'grass', 'guitar', 'gun', 'helicopter', 'horse', 'hoverboard', 'kart', 'key', 'kite', 'koala', 41 | 'leash', 'lion', 'lock', 'mask', 'microphone', 'monkey', 'motorcycle', 'oar', 'paper', 'paraglide', 'person', 'pig', 42 | 'pole', 'potted plant', 'puck', 'rack', 'rhino', 'rope', 'sail', 'scale', 'scooter', 'selfie stick', 'sheep', 'skateboard', 43 | 'ski', 'ski poles', 'snake', 'snowboard', 'stick', 'stroller', 'surfboard', 'swing', 'tennis racket', 'tractor', 'trailer', 44 | 'train', 'truck', 'turtle', 'varanus', 'violin', 'wheelchair' 45 | ] -------------------------------------------------------------------------------- /mask2former/test_time_augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | import logging 4 | from itertools import count 5 | 6 | import numpy as np 7 | import torch 8 | from fvcore.transforms import HFlipTransform 9 | from torch import nn 10 | from torch.nn.parallel import DistributedDataParallel 11 | 12 | from detectron2.data.detection_utils import read_image 13 | from detectron2.modeling import DatasetMapperTTA 14 | 15 | 16 | __all__ = [ 17 | "SemanticSegmentorWithTTA", 18 | ] 19 | 20 | 21 | class SemanticSegmentorWithTTA(nn.Module): 22 | """ 23 | A SemanticSegmentor with test-time augmentation enabled. 24 | Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`. 25 | """ 26 | 27 | def __init__(self, cfg, model, tta_mapper=None, batch_size=1): 28 | """ 29 | Args: 30 | cfg (CfgNode): 31 | model (SemanticSegmentor): a SemanticSegmentor to apply TTA on. 32 | tta_mapper (callable): takes a dataset dict and returns a list of 33 | augmented versions of the dataset dict. Defaults to 34 | `DatasetMapperTTA(cfg)`. 35 | batch_size (int): batch the augmented images into this batch size for inference. 
36 | """ 37 | super().__init__() 38 | if isinstance(model, DistributedDataParallel): 39 | model = model.module 40 | self.cfg = cfg.clone() 41 | 42 | self.model = model 43 | 44 | if tta_mapper is None: 45 | tta_mapper = DatasetMapperTTA(cfg) 46 | self.tta_mapper = tta_mapper 47 | self.batch_size = batch_size 48 | 49 | def __call__(self, batched_inputs): 50 | """ 51 | Same input/output format as :meth:`SemanticSegmentor.forward` 52 | """ 53 | 54 | def _maybe_read_image(dataset_dict): 55 | ret = copy.copy(dataset_dict) 56 | if "image" not in ret: 57 | image = read_image(ret.pop("file_name"), self.model.input_format) 58 | image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW 59 | ret["image"] = image 60 | if "height" not in ret and "width" not in ret: 61 | ret["height"] = image.shape[1] 62 | ret["width"] = image.shape[2] 63 | return ret 64 | 65 | processed_results = [] 66 | for x in batched_inputs: 67 | result = self._inference_one_image(_maybe_read_image(x)) 68 | processed_results.append(result) 69 | return processed_results 70 | 71 | def _inference_one_image(self, input): 72 | """ 73 | Args: 74 | input (dict): one dataset dict with "image" field being a CHW tensor 75 | Returns: 76 | dict: one output dict 77 | """ 78 | orig_shape = (input["height"], input["width"]) 79 | augmented_inputs, tfms = self._get_augmented_inputs(input) 80 | 81 | final_predictions = None 82 | count_predictions = 0 83 | for input, tfm in zip(augmented_inputs, tfms): 84 | count_predictions += 1 85 | with torch.no_grad(): 86 | if final_predictions is None: 87 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 88 | final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2]) 89 | else: 90 | final_predictions = self.model([input])[0].pop("sem_seg") 91 | else: 92 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 93 | final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2]) 94 | else: 95 | final_predictions += self.model([input])[0].pop("sem_seg") 96 | 97 | final_predictions = final_predictions / count_predictions 98 | return {"sem_seg": final_predictions} 99 | 100 | def _get_augmented_inputs(self, input): 101 | augmented_inputs = self.tta_mapper(input) 102 | tfms = [x.pop("transforms") for x in augmented_inputs] 103 | return augmented_inputs, tfms 104 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 168 | #.idea/ 169 | 170 | # Ruff stuff: 171 | .ruff_cache/ 172 | 173 | # PyPI configuration file 174 | .pypirc -------------------------------------------------------------------------------- /mask2former/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py 3 | """ 4 | Misc functions, including distributed helpers. 5 | 6 | Mostly copy-paste from torchvision references. 7 | """ 8 | from typing import List, Optional 9 | 10 | import torch 11 | import torch.distributed as dist 12 | import torchvision 13 | from torch import Tensor 14 | 15 | 16 | def _max_by_axis(the_list): 17 | # type: (List[List[int]]) -> List[int] 18 | maxes = the_list[0] 19 | for sublist in the_list[1:]: 20 | for index, item in enumerate(sublist): 21 | maxes[index] = max(maxes[index], item) 22 | return maxes 23 | 24 | 25 | class NestedTensor(object): 26 | def __init__(self, tensors, mask: Optional[Tensor]): 27 | self.tensors = tensors 28 | self.mask = mask 29 | 30 | def to(self, device): 31 | # type: (Device) -> NestedTensor # noqa 32 | cast_tensor = self.tensors.to(device) 33 | mask = self.mask 34 | if mask is not None: 35 | assert mask is not None 36 | cast_mask = mask.to(device) 37 | else: 38 | cast_mask = None 39 | return NestedTensor(cast_tensor, cast_mask) 40 | 41 | def decompose(self): 42 | return self.tensors, self.mask 43 | 44 | def __repr__(self): 45 | return str(self.tensors) 46 | 47 | 48 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 49 | # TODO make this more general 50 | if tensor_list[0].ndim == 3: 51 | if torchvision._is_tracing(): 52 | # nested_tensor_from_tensor_list() does not export well to ONNX 53 | # call _onnx_nested_tensor_from_tensor_list() instead 54 | return _onnx_nested_tensor_from_tensor_list(tensor_list) 55 | 56 | # TODO make it support different-sized images 57 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 58 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 59 | batch_shape = [len(tensor_list)] + max_size 60 | b, c, h, w = batch_shape 61 | dtype = tensor_list[0].dtype 62 | device = tensor_list[0].device 63 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 64 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 65 | for img, pad_img, m in zip(tensor_list, tensor, mask): 66 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 67 | m[: img.shape[1], : img.shape[2]] = False 68 | else: 69 | raise ValueError("not supported") 70 | return NestedTensor(tensor, mask) 71 | 72 | 73 | # _onnx_nested_tensor_from_tensor_list() is an implementation of 74 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
75 | @torch.jit.unused 76 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: 77 | max_size = [] 78 | for i in range(tensor_list[0].dim()): 79 | max_size_i = torch.max( 80 | torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) 81 | ).to(torch.int64) 82 | max_size.append(max_size_i) 83 | max_size = tuple(max_size) 84 | 85 | # work around for 86 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 87 | # m[: img.shape[1], :img.shape[2]] = False 88 | # which is not yet supported in onnx 89 | padded_imgs = [] 90 | padded_masks = [] 91 | for img in tensor_list: 92 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 93 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 94 | padded_imgs.append(padded_img) 95 | 96 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) 97 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) 98 | padded_masks.append(padded_mask.to(torch.bool)) 99 | 100 | tensor = torch.stack(padded_imgs) 101 | mask = torch.stack(padded_masks) 102 | 103 | return NestedTensor(tensor, mask=mask) 104 | 105 | 106 | def is_dist_avail_and_initialized(): 107 | if not dist.is_available(): 108 | return False 109 | if not dist.is_initialized(): 110 | return False 111 | return True 112 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import time 17 | import torch 18 | import torch.nn as nn 19 | from torch.autograd import gradcheck 20 | 21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | N, M, D = 1, 2, 2 25 | Lq, L, P = 2, 2, 2 26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 28 | S = sum([(H*W).item() for H, W in shapes]) 29 | 30 | 31 | torch.manual_seed(3) 32 | 33 | 34 | @torch.no_grad() 35 | def check_forward_equal_with_pytorch_double(): 36 | value = torch.rand(N, S, M, D).cuda() * 0.01 37 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 38 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 39 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 40 | im2col_step = 2 41 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 42 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 43 | fwdok = torch.allclose(output_cuda, output_pytorch) 44 | max_abs_err = (output_cuda - output_pytorch).abs().max() 45 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 46 | 47 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 48 | 49 | 50 | @torch.no_grad() 51 | def check_forward_equal_with_pytorch_float(): 52 | value = torch.rand(N, S, M, D).cuda() * 0.01 53 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 54 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 55 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 56 | im2col_step = 2 57 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 58 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 59 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 60 | max_abs_err = (output_cuda - output_pytorch).abs().max() 61 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 62 | 63 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 64 | 65 | 66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 67 | 68 | value = torch.rand(N, S, M, channels).cuda() * 0.01 69 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 70 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 71 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 72 | im2col_step = 2 73 | func = MSDeformAttnFunction.apply 74 | 75 | value.requires_grad = grad_value 76 | sampling_locations.requires_grad = grad_sampling_loc 77 | attention_weights.requires_grad = grad_attn_weight 78 | 79 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 80 | 81 | print(f'* 
{gradok} check_gradient_numerical(D={channels})') 82 | 83 | 84 | if __name__ == '__main__': 85 | check_forward_equal_with_pytorch_double() 86 | check_forward_equal_with_pytorch_float() 87 | 88 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 89 | check_gradient_numerical(channels, True, True, True) 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /dataset/image_to_seq_augmenter.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Modified from SeqFormer (https://github.com/wjf5203/SeqFormer) 3 | # ------------------------------------------------------------------------ 4 | # Modified from STEm-Seg (https://github.com/sabarim/STEm-Seg) 5 | # ------------------------------------------------------------------------ 6 | 7 | 8 | import imgaug 9 | import imgaug.augmenters as iaa 10 | import numpy as np 11 | 12 | from datetime import datetime 13 | 14 | from imgaug.augmentables.segmaps import SegmentationMapsOnImage 15 | from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage 16 | 17 | 18 | class ImageToSeqAugmenter(object): 19 | def __init__(self, perspective=True, affine=True, motion_blur=True, 20 | brightness_range=(-50, 50), hue_saturation_range=(-15, 15), perspective_magnitude=0.12, 21 | scale_range=1.0, translate_range={"x": (-0.15, 0.15), "y": (-0.15, 0.15)}, rotation_range=(-20, 20), 22 | motion_blur_kernel_sizes=(7, 9), motion_blur_prob=0.5): 23 | 24 | self.basic_augmenter = iaa.SomeOf((1, None), [ 25 | iaa.Add(brightness_range), 26 | iaa.AddToHueAndSaturation(hue_saturation_range) 27 | ] 28 | ) 29 | 30 | transforms = [] 31 | if perspective: 32 | transforms.append(iaa.PerspectiveTransform(perspective_magnitude)) 33 | if affine: 34 | transforms.append(iaa.Affine(scale=scale_range, 35 | translate_percent=translate_range, 36 | rotate=rotation_range, 37 | order=1, # cv2.INTER_LINEAR 38 | backend='auto')) 39 | transforms = iaa.Sequential(transforms) 40 | transforms = [transforms] 41 | 42 | if motion_blur: 43 | blur = iaa.Sometimes(motion_blur_prob, iaa.OneOf( 44 | [ 45 | iaa.MotionBlur(ksize) 46 | for ksize in motion_blur_kernel_sizes 47 | ] 48 | )) 49 | transforms.append(blur) 50 | 51 | self.frame_shift_augmenter = iaa.Sequential(transforms) 52 | 53 | @staticmethod 54 | def condense_masks(instance_masks): 55 | condensed_mask = np.zeros_like(instance_masks[0], dtype=np.int8) 56 | for instance_id, mask in enumerate(instance_masks, 1): 57 | condensed_mask = np.where(mask, instance_id, condensed_mask) 58 | 59 | return condensed_mask 60 | 61 | @staticmethod 62 | def expand_masks(condensed_mask, num_instances): 63 | return [(condensed_mask == instance_id).astype(np.uint8) for instance_id in range(1, num_instances + 1)] 64 | 65 | def __call__(self, image, masks=None, boxes=None): 66 | det_augmenter = self.frame_shift_augmenter.to_deterministic() 67 | 68 | 69 | if masks is not None: 70 | masks_np, is_binary_mask = [], [] 71 | boxs_np = [] 72 | 73 | for mask in masks: 74 | 75 | if isinstance(mask, np.ndarray): 76 | masks_np.append(mask.astype(np.bool)) 77 | is_binary_mask.append(False) 78 | else: 79 | raise ValueError("Invalid mask type: {}".format(type(mask))) 80 | 81 | num_instances = len(masks_np) 82 | masks_np = SegmentationMapsOnImage(self.condense_masks(masks_np), shape=image.shape[:2]) 83 | # boxs_np = BoundingBoxesOnImage(boxs_np, shape=image.shape[:2]) 84 | 85 | seed = 
int(datetime.now().strftime('%M%S%f')[-8:]) 86 | imgaug.seed(seed) 87 | aug_image, aug_masks = det_augmenter(image=self.basic_augmenter(image=image) , segmentation_maps=masks_np) 88 | imgaug.seed(seed) 89 | invalid_pts_mask = det_augmenter(image=np.ones(image.shape[:2] + (1,), np.uint8)).squeeze(2) 90 | aug_masks = self.expand_masks(aug_masks.get_arr(), num_instances) 91 | # aug_boxes = aug_boxes.remove_out_of_image().clip_out_of_image() 92 | aug_masks = [mask for mask, is_bm in zip(aug_masks, is_binary_mask)] 93 | return aug_image, aug_masks #, aug_boxes.to_xyxy_array() 94 | 95 | else: 96 | masks = [SegmentationMapsOnImage(np.ones(image.shape[:2], np.bool), shape=image.shape[:2])] 97 | aug_image, invalid_pts_mask = det_augmenter(image=image, segmentation_maps=masks) 98 | return aug_image, invalid_pts_mask.get_arr() == 0 99 | -------------------------------------------------------------------------------- /dataset/a2d_eval.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains implementations for the precision@k and IoU (mean, overall) evaluation metrics. 3 | copy-paste from https://github.com/mttr2021/MTTR/blob/main/metrics.py 4 | """ 5 | import torch 6 | from tqdm import tqdm 7 | from pycocotools.coco import COCO 8 | from pycocotools.mask import decode 9 | import numpy as np 10 | 11 | from torchvision.ops.boxes import box_area 12 | 13 | def compute_bbox_iou(boxes1: torch.Tensor, boxes2: torch.Tensor): 14 | # both boxes: xyxy 15 | area1 = box_area(boxes1) 16 | area2 = box_area(boxes2) 17 | 18 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 19 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 20 | 21 | wh = (rb - lt).clamp(min=0) # [N,M,2] 22 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 23 | 24 | union = area1[:, None] + area2 - inter 25 | 26 | iou = (inter+1e-6) / (union+1e-6) 27 | return iou, inter, union 28 | 29 | def compute_mask_iou(outputs: torch.Tensor, labels: torch.Tensor, EPS=1e-6): 30 | outputs = outputs.int() 31 | intersection = (outputs & labels).float().sum((1, 2)) # Will be zero if Truth=0 or Prediction=0 32 | union = (outputs | labels).float().sum((1, 2)) # Will be zero if both are 0 33 | iou = (intersection + EPS) / (union + EPS) # EPS is used to avoid division by zero 34 | return iou, intersection, union 35 | 36 | # mask 37 | def calculate_precision_at_k_and_iou_metrics(coco_gt: COCO, coco_pred: COCO): 38 | print('evaluating mask precision@k & iou metrics...') 39 | counters_by_iou = {iou: 0 for iou in [0.5, 0.6, 0.7, 0.8, 0.9]} 40 | total_intersection_area = 0 41 | total_union_area = 0 42 | ious_list = [] 43 | for instance in tqdm(coco_gt.imgs.keys()): # each image_id contains exactly one instance 44 | gt_annot = coco_gt.imgToAnns[instance][0] 45 | gt_mask = decode(gt_annot['segmentation']) 46 | pred_annots = coco_pred.imgToAnns[instance] 47 | pred_annot = sorted(pred_annots, key=lambda a: a['score'])[-1] # choose pred with highest score 48 | pred_mask = decode(pred_annot['segmentation']) 49 | iou, intersection, union = compute_mask_iou(torch.tensor(pred_mask).unsqueeze(0), 50 | torch.tensor(gt_mask).unsqueeze(0)) 51 | iou, intersection, union = iou.item(), intersection.item(), union.item() 52 | for iou_threshold in counters_by_iou.keys(): 53 | if iou > iou_threshold: 54 | counters_by_iou[iou_threshold] += 1 55 | total_intersection_area += intersection 56 | total_union_area += union 57 | ious_list.append(iou) 58 | num_samples = len(ious_list) 59 | precision_at_k = 
np.array(list(counters_by_iou.values())) / num_samples 60 | overall_iou = total_intersection_area / total_union_area 61 | mean_iou = np.mean(ious_list) 62 | return precision_at_k, overall_iou, mean_iou 63 | 64 | # bbox 65 | def calculate_bbox_precision_at_k_and_iou_metrics(coco_gt: COCO, coco_pred: COCO): 66 | print('evaluating bbox precision@k & iou metrics...') 67 | counters_by_iou = {iou: 0 for iou in [0.5, 0.6, 0.7, 0.8, 0.9]} 68 | total_intersection_area = 0 69 | total_union_area = 0 70 | ious_list = [] 71 | for instance in tqdm(coco_gt.imgs.keys()): # each image_id contains exactly one instance 72 | gt_annot = coco_gt.imgToAnns[instance][0] 73 | gt_bbox = gt_annot['bbox'] # xywh 74 | gt_bbox = [ 75 | gt_bbox[0], 76 | gt_bbox[1], 77 | gt_bbox[2] + gt_bbox[0], 78 | gt_bbox[3] + gt_bbox[1], 79 | ] 80 | pred_annots = coco_pred.imgToAnns[instance] 81 | pred_annot = sorted(pred_annots, key=lambda a: a['score'])[-1] # choose pred with highest score 82 | pred_bbox = pred_annot['bbox'] # xyxy 83 | iou, intersection, union = compute_bbox_iou(torch.tensor(pred_bbox).unsqueeze(0), 84 | torch.tensor(gt_bbox).unsqueeze(0)) 85 | iou, intersection, union = iou.item(), intersection.item(), union.item() 86 | for iou_threshold in counters_by_iou.keys(): 87 | if iou > iou_threshold: 88 | counters_by_iou[iou_threshold] += 1 89 | total_intersection_area += intersection 90 | total_union_area += union 91 | ious_list.append(iou) 92 | num_samples = len(ious_list) 93 | precision_at_k = np.array(list(counters_by_iou.values())) / num_samples 94 | overall_iou = total_intersection_area / total_union_area 95 | mean_iou = np.mean(ious_list) 96 | return precision_at_k, overall_iou, mean_iou 97 | -------------------------------------------------------------------------------- /mask2former/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | from detectron2.config import CfgNode as CN 4 | 5 | 6 | def add_maskformer2_config(cfg): 7 | """ 8 | Add config for MASK_FORMER. 9 | """ 10 | # NOTE: configs from original maskformer 11 | # data config 12 | # select the dataset mapper 13 | cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic" 14 | # Color augmentation 15 | cfg.INPUT.COLOR_AUG_SSD = False 16 | # We retry random cropping until no single category in semantic segmentation GT occupies more 17 | # than `SINGLE_CATEGORY_MAX_AREA` part of the crop. 18 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0 19 | # Pad image and segmentation GT in dataset mapper. 
20 | cfg.INPUT.SIZE_DIVISIBILITY = -1 21 | 22 | # solver config 23 | # weight decay on embedding 24 | cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0 25 | # optimizer 26 | cfg.SOLVER.OPTIMIZER = "ADAMW" 27 | cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1 28 | 29 | # mask_former model config 30 | cfg.MODEL.MASK_FORMER = CN() 31 | 32 | # loss 33 | cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION = True 34 | cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT = 0.1 35 | cfg.MODEL.MASK_FORMER.CLASS_WEIGHT = 1.0 36 | cfg.MODEL.MASK_FORMER.DICE_WEIGHT = 1.0 37 | cfg.MODEL.MASK_FORMER.MASK_WEIGHT = 20.0 38 | 39 | # transformer config 40 | cfg.MODEL.MASK_FORMER.NHEADS = 8 41 | cfg.MODEL.MASK_FORMER.DROPOUT = 0.1 42 | cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD = 2048 43 | cfg.MODEL.MASK_FORMER.ENC_LAYERS = 0 44 | cfg.MODEL.MASK_FORMER.DEC_LAYERS = 6 45 | cfg.MODEL.MASK_FORMER.PRE_NORM = False 46 | 47 | cfg.MODEL.MASK_FORMER.HIDDEN_DIM = 256 48 | cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES = 100 49 | 50 | cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE = "res5" 51 | cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ = False 52 | 53 | # mask_former inference config 54 | cfg.MODEL.MASK_FORMER.TEST = CN() 55 | cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True 56 | cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = False 57 | cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = False 58 | cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0 59 | cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD = 0.0 60 | cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False 61 | 62 | # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet) 63 | # you can use this config to override 64 | cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY = 32 65 | 66 | # pixel decoder config 67 | cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256 68 | # adding transformer in pixel decoder 69 | cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0 70 | # pixel decoder 71 | cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder" 72 | 73 | # swin transformer backbone 74 | cfg.MODEL.SWIN = CN() 75 | cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224 76 | cfg.MODEL.SWIN.PATCH_SIZE = 4 77 | cfg.MODEL.SWIN.EMBED_DIM = 96 78 | cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2] 79 | cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24] 80 | cfg.MODEL.SWIN.WINDOW_SIZE = 7 81 | cfg.MODEL.SWIN.MLP_RATIO = 4.0 82 | cfg.MODEL.SWIN.QKV_BIAS = True 83 | cfg.MODEL.SWIN.QK_SCALE = None 84 | cfg.MODEL.SWIN.DROP_RATE = 0.0 85 | cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0 86 | cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3 87 | cfg.MODEL.SWIN.APE = False 88 | cfg.MODEL.SWIN.PATCH_NORM = True 89 | cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"] 90 | cfg.MODEL.SWIN.USE_CHECKPOINT = False 91 | 92 | # NOTE: maskformer2 extra configs 93 | # transformer module 94 | cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME = "MultiScaleMaskedTransformerDecoder" 95 | 96 | # LSJ aug 97 | cfg.INPUT.IMAGE_SIZE = 1024 98 | cfg.INPUT.MIN_SCALE = 0.1 99 | cfg.INPUT.MAX_SCALE = 2.0 100 | 101 | # MSDeformAttn encoder configs 102 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"] 103 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4 104 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8 105 | 106 | # point loss configs 107 | # Number of points sampled during training for a mask point head. 108 | cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS = 112 * 112 109 | # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the 110 | # original paper. 
111 | cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO = 3.0 112 | # Importance sampling parameter for PointRend point sampling during training. Parameter `beta` in 113 | # the original paper. 114 | cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75 115 | -------------------------------------------------------------------------------- /util/stanza_utils.py: -------------------------------------------------------------------------------- 1 | import stanza 2 | import torch 3 | import os 4 | # from predict import get_clip_embeddings 5 | from torch.nn import functional as F 6 | 7 | 8 | nlp = stanza.Pipeline('en', dir = "../checkpoints/pweight", download_method=None) 9 | 10 | MAIN_POS = ['NOUN', 'VERB', 'ADJ', 'PROPN', 'NUM'] 11 | 12 | def find_no_main_words(sent, start_idx, tokenizer): 13 | # id_map = {} 14 | # doc = nlp(sent) 15 | # cursor = start_idx + 1 16 | # main_words_ids = [] 17 | # for word in doc.sentences[0].words: 18 | # temp = cursor; cursor += 1 19 | # main_words_ids.append(word.id) 20 | # id_map[word.id] = list(range(temp, cursor)) 21 | # return main_words_ids, id_map 22 | 23 | doc = nlp(sent) 24 | id_map = {} 25 | cursor = start_idx + 1 26 | main_words_ids = [] 27 | sent_split = sent.split() 28 | cnt = 0; rword = ""; flag = False 29 | for word in doc.sentences[0].words: 30 | if word.text != sent_split[cnt]: 31 | if word.pos: flag = True 32 | rword += word.text 33 | if rword != sent_split[cnt]: 34 | continue 35 | else: 36 | word.text = rword 37 | rword = "" 38 | if flag: word.pos = MAIN_POS[0] 39 | flag = False 40 | cnt += 1 41 | temp = cursor 42 | cursor += len(tokenizer(word.text, return_tensors="pt").input_ids[0]) - 1 43 | id_map[word.id] = list(range(temp, cursor)) 44 | if word.pos: 45 | main_words_ids.append(word.id) 46 | return main_words_ids, id_map 47 | 48 | def find_main_words(sent, start_idx, tokenizer): 49 | doc = nlp(sent) 50 | id_map = {} 51 | cursor = start_idx + 1 52 | main_words_ids = [] 53 | sent_split = sent.split() 54 | cnt = 0; rword = ""; flag = False 55 | for word in doc.sentences[0].words: 56 | if word.text != sent_split[cnt]: # the current whitespace token was split into several words by stanza 57 | if word.pos in MAIN_POS: flag = True 58 | rword += word.text 59 | if rword != sent_split[cnt]: 60 | continue 61 | else: 62 | word.text = rword 63 | rword = "" 64 | if flag: word.pos = MAIN_POS[0] 65 | flag = False 66 | cnt += 1 67 | temp = cursor 68 | cursor += len(tokenizer(word.text, return_tensors="pt").input_ids[0]) - 1 69 | id_map[word.id] = list(range(temp, cursor)) 70 | if word.pos in MAIN_POS: 71 | main_words_ids.append(word.id) 72 | return main_words_ids, id_map 73 | 74 | def DFS_right(node, variables: list): 75 | if node.is_leaf() or variables[1] != '': 76 | return 77 | if node.label == 'NN': 78 | variables[1] = str(node.children[0]) 79 | return 80 | for i in range(len(node.children)): 81 | idx = len(node.children) - i - 1 82 | DFS_right(node.children[idx], variables) 83 | 84 | 85 | def DFS_left(node, variables: list): 86 | if node.is_leaf() or variables[1] != '': 87 | return 88 | if node.label == 'NP': 89 | variables[0] += 1 90 | now_find_np = variables[0] 91 | for child in node.children: 92 | DFS_left(child, variables) 93 | if node.label == 'NP' and variables[0] == now_find_np: 94 | # find rightmost NN 95 | DFS_right(node, variables) 96 | 97 | 98 | def find_agent_by_stanza(sent): 99 | doc = nlp(sent) 100 | ''' 101 | int and str arguments cannot be modified in place by a callee, while the contents of a list can, 102 | so to let DFS_left update these values we pack them into a list: 103 |
variables[0] means the number of occurrence of NP, while variables[1] records the agent 104 | ''' 105 | variables = [0, ''] 106 | DFS_left(doc.sentences[0].constituency, variables) 107 | agent = variables[1] 108 | 109 | if agent == '': 110 | for i in range(len(doc.sentences[0].words)): 111 | idx = len(doc.sentences[0].words) - i - 1 112 | if doc.sentences[0].words[idx].pos == 'NOUN': 113 | agent = doc.sentences[0].words[idx].text 114 | break 115 | # if agent == '': 116 | # for i in range(len(doc.sentences[0].words)): 117 | # if doc.sentences[0].words[i].deprel == 'root': 118 | # agent = doc.sentences[0].words[i].text 119 | # break 120 | if agent == '': 121 | agent = '[UNK]' 122 | agent_id = -1 123 | for word in doc.sentences[0].words: 124 | if agent == word.text: 125 | agent_id = word.id 126 | return agent, agent_id, len(doc.sentences[0].words) 127 | 128 | if __name__ == '__main__': 129 | text = "a red and white checkered table with two wooden chairs" 130 | print(find_agent_by_stanza(text)[0]) 131 | 132 | -------------------------------------------------------------------------------- /mask2former/evaluation/instance_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import contextlib 3 | import copy 4 | import io 5 | import itertools 6 | import json 7 | import logging 8 | import numpy as np 9 | import os 10 | import pickle 11 | from collections import OrderedDict 12 | import pycocotools.mask as mask_util 13 | import torch 14 | from pycocotools.coco import COCO 15 | from pycocotools.cocoeval import COCOeval 16 | from tabulate import tabulate 17 | 18 | import detectron2.utils.comm as comm 19 | from detectron2.config import CfgNode 20 | from detectron2.data import MetadataCatalog 21 | from detectron2.data.datasets.coco import convert_to_coco_json 22 | from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco 23 | from detectron2.evaluation.fast_eval_api import COCOeval_opt 24 | from detectron2.structures import Boxes, BoxMode, pairwise_iou 25 | from detectron2.utils.file_io import PathManager 26 | from detectron2.utils.logger import create_small_table 27 | 28 | 29 | # modified from COCOEvaluator for instance segmetnat 30 | class InstanceSegEvaluator(COCOEvaluator): 31 | """ 32 | Evaluate AR for object proposals, AP for instance detection/segmentation, AP 33 | for keypoint detection outputs using COCO's metrics. 34 | See http://cocodataset.org/#detection-eval and 35 | http://cocodataset.org/#keypoints-eval to understand its metrics. 36 | The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means 37 | the metric cannot be computed (e.g. due to no predictions made). 38 | 39 | In addition to COCO, this evaluator is able to support any bounding box detection, 40 | instance segmentation, or keypoint detection dataset. 41 | """ 42 | 43 | def _eval_predictions(self, predictions, img_ids=None): 44 | """ 45 | Evaluate predictions. Fill self._results with the metrics of the tasks. 
46 | """ 47 | self._logger.info("Preparing results for COCO format ...") 48 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) 49 | tasks = self._tasks or self._tasks_from_predictions(coco_results) 50 | 51 | # unmap the category ids for COCO 52 | if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): 53 | dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id 54 | # all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) 55 | # num_classes = len(all_contiguous_ids) 56 | # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 57 | 58 | reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} 59 | for result in coco_results: 60 | category_id = result["category_id"] 61 | # assert category_id < num_classes, ( 62 | # f"A prediction has class={category_id}, " 63 | # f"but the dataset only has {num_classes} classes and " 64 | # f"predicted class id should be in [0, {num_classes - 1}]." 65 | # ) 66 | assert category_id in reverse_id_mapping, ( 67 | f"A prediction has class={category_id}, " 68 | f"but the dataset only has class ids in {dataset_id_to_contiguous_id}." 69 | ) 70 | result["category_id"] = reverse_id_mapping[category_id] 71 | 72 | if self._output_dir: 73 | file_path = os.path.join(self._output_dir, "coco_instances_results.json") 74 | self._logger.info("Saving results to {}".format(file_path)) 75 | with PathManager.open(file_path, "w") as f: 76 | f.write(json.dumps(coco_results)) 77 | f.flush() 78 | 79 | if not self._do_evaluation: 80 | self._logger.info("Annotations are not available for evaluation.") 81 | return 82 | 83 | self._logger.info( 84 | "Evaluating predictions with {} COCO API...".format( 85 | "unofficial" if self._use_fast_impl else "official" 86 | ) 87 | ) 88 | for task in sorted(tasks): 89 | assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!" 
90 | coco_eval = ( 91 | _evaluate_predictions_on_coco( 92 | self._coco_api, 93 | coco_results, 94 | task, 95 | kpt_oks_sigmas=self._kpt_oks_sigmas, 96 | use_fast_impl=self._use_fast_impl, 97 | img_ids=img_ids, 98 | max_dets_per_image=self._max_dets_per_image, 99 | ) 100 | if len(coco_results) > 0 101 | else None # cocoapi does not handle empty results very well 102 | ) 103 | 104 | res = self._derive_coco_results( 105 | coco_eval, task, class_names=self._metadata.get("thing_classes") 106 | ) 107 | self._results[task] = res 108 | -------------------------------------------------------------------------------- /demo/itergradcam.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import cv2 4 | import torch 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | from skimage import transform as skimage_transform 8 | from scipy.ndimage import filters 9 | from PIL import Image 10 | import cv2 11 | 12 | def mySig(x, a = 20, b = 0.5): 13 | sig_x = 1 / (1 + torch.exp(a*(-x+b))) 14 | sig_x = sig_x / sig_x.max() 15 | return sig_x 16 | 17 | def getAttMap(img, attMap, blur = False, overlap = True): 18 | attMap -= attMap.min() 19 | if attMap.max() > 0: 20 | attMap /= attMap.max() 21 | attMap = skimage_transform.resize(attMap, (img.shape[:2]), order = 3, mode = 'constant') 22 | # attMap = skimage_transform.resize(attMap, (attMap.shape), order = 3, mode = 'constant') 23 | 24 | if blur: 25 | attMap = filters.gaussian_filter(attMap, 0.02*max(img.shape[:2])) 26 | attMap -= attMap.min() 27 | attMap /= attMap.max() 28 | cmap = plt.get_cmap('jet') 29 | attMapV = cmap(attMap) 30 | attMapV = np.delete(attMapV, 3, 2) 31 | if overlap: 32 | attMap = 1*(1-attMap**0.7).reshape(attMap.shape + (1,))*img + (attMap**0.7).reshape(attMap.shape+(1,)) * attMapV 33 | return attMap 34 | 35 | def show_groundvlp(image_path,query,gradcam,save_path = None): 36 | # category, 37 | # boxes_category, 38 | # od_scores 39 | num_image = 2 40 | fig, ax = plt.subplots(num_image, 1, figsize=(15, 5 * num_image)) 41 | 42 | bgr_image = cv2.imread(image_path) 43 | ax[0].imshow(bgr_image[:, :, ::-1]) 44 | ax[0].set_yticks([]) 45 | ax[0].set_xticks([]) 46 | ax[0].set_xlabel(query, fontsize=15) 47 | 48 | rgb_image = cv2.imread(image_path)[:, :, ::-1] 49 | rgb_image = np.float32(rgb_image) / 255 50 | 51 | gradcam_image = getAttMap(rgb_image, gradcam) 52 | np.clip(gradcam_image, 0., 1., out=gradcam_image) 53 | # print(np.max(gradcam_image)) 54 | ax[1].imshow(gradcam_image) 55 | ax[1].set_yticks([]) 56 | ax[1].set_xticks([]) 57 | ax[1].set_xlabel(query, fontsize=20) 58 | # plt.show() 59 | if save_path: 60 | plt.savefig(save_path) 61 | 62 | 63 | def IterGradCAM(engine,img_path,query,iter_num,path = './erase_imgs',vis = False,lmbda = 0.8): 64 | # Iterative机制 65 | gradcam = None 66 | if os.path.exists(path): 67 | shutil.rmtree(path) 68 | os.makedirs(path, exist_ok=True) 69 | 70 | itm_scores = [] 71 | pre_gradcam = None 72 | flag = True 73 | for i in range(iter_num): 74 | # gradcam, itm_score = engine.visualize_groundvlp(image_path=img_path,query=query,epoch=i) 75 | if i != 0: 76 | gradcam, itm_score = engine.visualize_groundvlp(image_path=img_path,query=query,epoch=i) 77 | # tmp = torch.exp(tmp) / torch.exp(torch.tensor(1.0)) 78 | # tmp = mySig(tmp) 79 | if grad_rev_sum.item()*itm_score.item() < itm_scores[-1]: 80 | # if itm_score.item() < itm_scores[-1]: 81 | flag = False 82 | break 83 | # gradcam = gradcam*lmbda + tmp*(1-lmbda) 84 | # gradcam = gradcam*(1-lmbda) + tmp*lmbda 85 
| # gradcam = gradcam + tmp 86 | # gradcam = gradcam.clamp(0.0, 1.0) 87 | # gradcam -= gradcam.min() 88 | # if gradcam.max() > 0: 89 | # gradcam /= gradcam.max() 90 | else: 91 | gradcam, itm_score = engine.visualize_groundvlp(image_path=img_path,query=query,epoch=i) 92 | # gradcam = torch.exp(gradcam) / torch.exp(torch.tensor(1.0)) 93 | gradcam = mySig(gradcam) 94 | Image.open(img_path).save(f"{path}/raw.jpg") 95 | # img_path = f"{path}/raw.jpg" 96 | pre_gradcam = torch.zeros_like(gradcam) 97 | 98 | 99 | grad_rev_sum = torch.sum(torch.ones_like(pre_gradcam)-pre_gradcam) / pre_gradcam.numel() 100 | 101 | itm_scores.append(itm_score.item() * grad_rev_sum.item()) 102 | print(f"Iteration {i}") 103 | 104 | pre_gradcam = gradcam 105 | 106 | if vis: show_groundvlp(f"{path}/raw.jpg",query,gradcam,f"{path}/{i}_grad.jpg") 107 | # show_groundvlp(f"{path}/raw.jpg",query,gradcam,f"{path}/result.jpg") 108 | if flag: i = iter_num  # no early stop was triggered: report the full iteration count 109 | return gradcam, i 110 | def seed_torch(seed): 111 | os.environ['PYTHONHASHSEED'] = str(seed) 112 | np.random.seed(seed) 113 | torch.manual_seed(seed) 114 | torch.cuda.manual_seed(seed) 115 | torch.backends.cudnn.benchmark = False 116 | torch.backends.cudnn.deterministic = True 117 | 118 | if __name__ == "__main__": 119 | # Set up the image path and the text query 120 | import sys 121 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 122 | from models.albef.engine import ALBEF 123 | seed = 1234 124 | seed_torch(seed) 125 | # torch.manual_seed(seed) 126 | # torch.cuda.manual_seed(seed) 127 | img_path = "/home/vegetabot/Filesys/CodeField_win/referformer_modify/data/coco/train2014/COCO_train2014_000000274266.jpg" 128 | query = "a man standing next to a young girl on a grassy hillside" # "a lady pouring wine in a glass" 129 | 130 | 131 | 132 | engine = ALBEF(model_id='ALBEF', device='cuda', templates='there is a {}') 133 | IterGradCAM(engine,img_path,query,3, vis = True) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # IteRPrimE: Zero-shot Referring Image Segmentation with Iterative Grad-CAM Refinement and Primary Word Emphasis 3 | 4 | ## 📝 Overview 5 | 6 | Architecture 7 | 8 | IteRPrimE is a novel framework for zero-shot referring image segmentation that leverages an Iterative Grad-CAM Refinement Strategy (IGRS) and a Primary Word Emphasis Module (PWEM) to improve localization accuracy—especially in cases with complex spatial descriptions. By using pre-trained vision-language models, IteRPrimE eliminates the need for further training or fine-tuning, achieving state-of-the-art performance on benchmarks such as RefCOCO, RefCOCO+, RefCOCOg, and PhraseCut. 9 | 10 | ## ✨ News 11 | 12 | - [2025.04.05]🔥 v1 of the code is now officially open-sourced! 🎉 13 | - [2024.12.10]🔥 IteRPrimE is accepted by AAAI-2025! 🥳 14 | 15 | ## 🛠 Installation 16 | 17 | 1. Create and activate a Conda environment: 18 | 19 | ``` 20 | conda create -n iterprime python=3.8.19 21 | conda activate iterprime 22 | ``` 23 | 24 | 2.
Install the dependencies: 25 | 26 | ``` 27 | pip install -r requirements.txt 28 | ``` 29 | 30 | If you encounter any issues with the environment setup, you can refer to the configurations in the following two open-source projects for assistance: 31 | 32 | - [GroundVLP](https://github.com/om-ai-lab/GroundVLP) 33 | 34 | - [Mask2Former](https://github.com/facebookresearch/Mask2Former) 35 | 36 | ## 🏋️ Preparing Pretrained Model Weights 37 | 38 | Before you can run the code, you'll need to prepare the pretrained model weights and dataset. Please follow the steps below: 39 | 40 | ### 1. Create the necessary directories 41 | 42 | In the root directory of the project, create two directories: `checkpoints` and `data`. 43 | 44 | ```bash 45 | mkdir checkpoints 46 | mkdir data 47 | ``` 48 | 49 | ### 2. Download the pretrained model weights 50 | 51 | Download the following pretrained model weights and place them in the `checkpoints` folder. You can download them directly from the given links. 52 | 53 | - **Mask2Former weights:** [model_final_f07440.pkl](https://pan.baidu.com/s/1phxTevS--Pe-3p611Pt75g) [pt6z] 54 | - **ALBEF weights:** [ALBEF.pth](https://pan.baidu.com/s/1GlO1LIjjvp_G5fb7z49VJQ) [8u5y] 55 | - **Stanza weights:** [pweight.zip](https://pan.baidu.com/s/1P1bHVgiRUGsm1bWzcCy5-w) [6f84] 56 | - **BERT-base-uncased weights:** [bert-base-uncased.zip](https://pan.baidu.com/s/1l9afrG0tgAHDArh6ON3ieA) [bggq] 57 | 58 | After downloading, extract the weight files (if in a compressed format) into the `checkpoints` folder. Your folder structure should look like this: 59 | 60 | ```shell 61 | / 62 | │ 63 | ├── checkpoints/ 64 | │ ├── model_final_f07440.pkl 65 | │ ├── ALBEF.pth 66 | │ ├── pweight/ 67 | │ └── bert-base-uncased/ 68 | └── data/ 69 | ``` 70 | 71 | ### 3. Download the COCO dataset 72 | 73 | Download the COCO dataset from the provided link and place it in the `data` folder. 74 | 75 | - **COCO dataset:** [coco.zip](https://pan.baidu.com/s/1MxNHMRmlkqLqsd9MWFWHpQ) [egus] 76 | 77 | After downloading, extract the coco.zip file to the `data` folder. The final directory structure should look like this: 78 | 79 | ```shell 80 | / 81 | │ 82 | ├── data/ 83 | │ └── coco/ 84 | └── checkpoints/ 85 | ``` 86 | 87 | Once the weights and dataset are in place, you’re ready to run the code and start testing the model. 88 | 89 | 90 | ## 🚀 Running the Demo 91 | 92 | To quickly verify your setup and see IteRPrimE in action, follow these steps: 93 | 94 | 1. **Navigate to the `demo` directory**: 95 | 96 | ```bash 97 | cd demo 98 | ``` 99 | 100 | 2. **Run the main script**: 101 | 102 | Run the following command to evaluate IteRPrimE on the **refcoco testA** split: 103 | 104 | ```bash 105 | python IteRPrimE.py --data-set refcoco --image-set testA 106 | ``` 107 | 108 | This script will: 109 | 110 | - Load the necessary configurations and pretrained model weights. 111 | 112 | - Perform zero-shot referring image segmentation on a sample image or dataset. 113 | 114 | - Output the segmentation results for you to inspect. 115 | 116 | 117 | 118 | Depending on your configuration, the output images or logs may be saved to a specific folder (e.g., `outputs/`). Check the script and your config settings for details. 119 | 120 | If you encounter any issues or have questions, feel free to open an issue or start a discussion.
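If you prefer to sanity-check a single image from Python rather than running the full evaluation, the snippet below mirrors the `__main__` block of `demo/itergradcam.py`. Treat it as a minimal sketch: the image path and query are placeholders to replace with your own, and the ALBEF engine arguments are taken directly from that script.

```python
# Minimal single-image sketch based on demo/itergradcam.py (run it from the demo/ directory).
import os
import sys

sys.path.insert(1, os.path.join(sys.path[0], '..'))

from models.albef.engine import ALBEF           # the VLP engine used by the demo
from itergradcam import IterGradCAM, seed_torch

seed_torch(1234)  # fix the random seeds for reproducibility

# Placeholder inputs: point these at your own image and referring expression.
img_path = "path/to/your/image.jpg"
query = "a man standing next to a young girl on a grassy hillside"

engine = ALBEF(model_id='ALBEF', device='cuda', templates='there is a {}')

# Run up to 3 refinement iterations; intermediate heatmaps go to ./erase_imgs when vis=True.
gradcam, num_iters = IterGradCAM(engine, img_path, query, iter_num=3, vis=True)
print(f"stopped after {num_iters} iterations, Grad-CAM map shape: {tuple(gradcam.shape)}")
```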
121 | 122 | 123 | 124 | 125 | ## 📖 Citation 126 | 127 | If you use IteRPrimE in your research, please cite our paper: 128 | 129 | ``` 130 | @article{wang2025iterprime, 131 | title={IteRPrimE: Zero-shot Referring Image Segmentation with Iterative Grad-CAM Refinement and Primary Word Emphasis}, 132 | author={Wang, Yuji and Ni, Jingchen and Liu, Yong and Yuan, Chun and Tang, Yansong}, 133 | journal={arXiv preprint arXiv:2503.00936}, 134 | year={2025} 135 | } 136 | ``` 137 | 138 | ## 🤝 Acknowledgements 139 | 140 | We appreciate the support from our collaborators and funding agencies. Stay tuned for updates; the full code release is coming soon! 141 | 142 | We would also like to express our gratitude to the authors of [GroundVLP](https://github.com/om-ai-lab/GroundVLP) and [Mask2Former](https://github.com/facebookresearch/Mask2Former) for their inspiring work and open-source contributions, which served as valuable references for this project. 143 | -------------------------------------------------------------------------------- /models/albef/VL_Transformer_ITM.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import numpy as np 4 | from matplotlib import pyplot as plt 5 | from scipy.ndimage import filters 6 | 7 | from models.albef.models.tokenization_bert import BertTokenizer 8 | from models.albef.models.vit import VisionTransformer 9 | from models.albef.models.xbert import BertConfig, BertModel 10 | from skimage import transform as skimage_transform 11 | 12 | import torch 13 | from torch import nn 14 | from torchvision import transforms 15 | import re 16 | from PIL import Image 17 | 18 | 19 | 20 | import math 21 | 22 | import torch.nn.functional as F 23 | 24 | # Spatial Pyramid Pooling (SPP) layer 25 | class SPPLayer(torch.nn.Module): 26 | 27 | def __init__(self, num_levels, pool_type='max_pool'): 28 | super(SPPLayer, self).__init__() 29 | 30 | self.num_levels = num_levels 31 | self.pool_type = pool_type 32 | 33 | def forward(self, x): 34 | num, n, c = x.size() # num: batch size, n: number of tokens, c: channel dimension 35 | 36 | h = 16 37 | w = 16 38 | # extract the leading [CLS] token 39 | cls_token = x[:, 0, :].view(num, 1, c) 40 | 41 | # apply SPP over the remaining image tokens 42 | x = x[:, 1:, :].view(num, c, h, w) 43 | original_num_tokens = 256 44 | 45 | for i in range(self.num_levels): 46 | level = i + 1 47 | kernel_size = (math.ceil(h / level), math.ceil(w / level)) 48 | stride = (math.ceil(h / level), math.ceil(w / level)) 49 | pooling = (math.floor((kernel_size[0]*level-h+1)/2), math.floor((kernel_size[1]*level-w+1)/2)) 50 | 51 | # choose the pooling type 52 | if self.pool_type == 'max_pool': 53 | tensor = F.max_pool2d(x, kernel_size=kernel_size, stride=stride, padding=pooling) 54 | else: 55 | tensor = F.avg_pool2d(x, kernel_size=kernel_size, stride=stride, padding=pooling) 56 | 57 | # flatten and concatenate the pooled tokens 58 | if (i == 0): 59 | x_flatten = tensor.view(num, -1, c) 60 | else: 61 | x_flatten = torch.cat((x_flatten, tensor.view(num, -1, c)), 1) 62 | 63 | # concatenate the original [CLS] token back (disabled) 64 | # x_flatten = torch.cat((cls_token, x_flatten), 1) 65 | 66 | # replicate the [CLS] token so the total token count matches the original 257 67 | cls_tokens = cls_token.expand(-1, original_num_tokens-x_flatten.shape[1]+2, -1) 68 | x_flatten = torch.cat((cls_tokens, x_flatten[:, 1:, :]), 1) 69 | 70 | 71 | return x_flatten 72 | 73 | 74 | class VL_Transformer_ITM(nn.Module): 75 | def __init__(self, 76 | text_encoder=None, 77 | config_bert='' 78 | ): 79 | super().__init__() 80 | 81 | bert_config = BertConfig.from_json_file(config_bert) 82 |
self.visual_encoder = VisionTransformer( 84 | img_size=256, patch_size=16, embed_dim=768, depth=12, num_heads=12, 85 | mlp_ratio=4, qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6)) 86 | 87 | self.text_encoder = BertModel.from_pretrained(text_encoder, config=bert_config, add_pooling_layer=False) 88 | 89 | self.itm_head = nn.Linear(768, 2) 90 | 91 | self.spp = SPPLayer(6) 92 | 93 | def forward(self, image, text, gradcam=None): 94 | image_embeds = self.visual_encoder(image) 95 | # experiment: see what happens if fine-grained (SPP) features are injected here 96 | # image_embeds = image_embeds.reshape(1,-1,16,16) 97 | # image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image.device) 98 | # image_embeds = self.spp(image_embeds) 99 | # when a Grad-CAM map is provided, use it as the attention mask over the image tokens (the [CLS] slot stays on) 100 | if gradcam is None: image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image.device) 101 | else: 102 | B = image_embeds.shape[0] 103 | ones = torch.ones(B, 1, dtype=torch.long, device=image.device) 104 | image_atts = torch.cat((ones, gradcam.to(image.device)), dim=1) 105 | 106 | output = self.text_encoder(text.input_ids.to(image.device), 107 | attention_mask=text.attention_mask.to(image.device), 108 | encoder_hidden_states=image_embeds, 109 | encoder_attention_mask=image_atts.to(image.device), 110 | return_dict=True, 111 | ) 112 | vl_embeddings = output.last_hidden_state[:, 0, :] 113 | 114 | # print(vl_embeddings) 115 | vl_output = self.itm_head(vl_embeddings) 116 | # _,pred = torch.max(vl_output,1) 117 | # print(type(vl_output)) 118 | # return image_embeds, output.last_hidden_state, vl_output 119 | return vl_output 120 | 121 | 122 | 123 | def getAttMap(img, attMap, blur = True, overlap = True): 124 | attMap -= attMap.min() 125 | if attMap.max() > 0: 126 | attMap /= attMap.max() 127 | attMap = skimage_transform.resize(attMap, (img.shape[:2]), order = 3, mode = 'constant') 128 | # attMap = skimage_transform.resize(attMap, (attMap.shape), order = 3, mode = 'constant') 129 | 130 | if blur: 131 | attMap = filters.gaussian_filter(attMap, 0.02*max(img.shape[:2])) 132 | attMap -= attMap.min() 133 | attMap /= attMap.max() 134 | cmap = plt.get_cmap('jet') 135 | attMapV = cmap(attMap) 136 | attMapV = np.delete(attMapV, 3, 2) 137 | if overlap: 138 | attMap = 1*(1-attMap**0.7).reshape(attMap.shape + (1,))*img + (attMap**0.7).reshape(attMap.shape+(1,)) * attMapV 139 | return attMap 140 | 141 | normalize = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)) 142 | 143 | transform = transforms.Compose([ 144 | transforms.Resize((256, 256), interpolation=Image.BICUBIC), 145 | transforms.ToTensor(), 146 | normalize, 147 | ]) 148 | 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /dataset/coco.py: -------------------------------------------------------------------------------- 1 | """ 2 | COCO dataset which returns image_id for evaluation.
3 | 4 | Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py 5 | """ 6 | from pathlib import Path 7 | 8 | import torch 9 | import torch.utils.data 10 | import torchvision 11 | from pycocotools import mask as coco_mask 12 | 13 | import datasets.transforms as T 14 | 15 | 16 | class CocoDetection(torchvision.datasets.CocoDetection): 17 | def __init__(self, img_folder, ann_file, transforms, return_masks): 18 | super(CocoDetection, self).__init__(img_folder, ann_file) 19 | self._transforms = transforms 20 | self.prepare = ConvertCocoPolysToMask(return_masks) 21 | 22 | def __getitem__(self, idx): 23 | img, target = super(CocoDetection, self).__getitem__(idx) 24 | image_id = self.ids[idx] 25 | target = {'image_id': image_id, 'annotations': target} 26 | 27 | img, target = self.prepare(img, target) 28 | if self._transforms is not None: 29 | img, target = self._transforms(img, target) 30 | return img, target 31 | 32 | 33 | def convert_coco_poly_to_mask(segmentations, height, width): 34 | masks = [] 35 | for polygons in segmentations: 36 | rles = coco_mask.frPyObjects(polygons, height, width) 37 | mask = coco_mask.decode(rles) 38 | if len(mask.shape) < 3: 39 | mask = mask[..., None] 40 | mask = torch.as_tensor(mask, dtype=torch.uint8) 41 | mask = mask.any(dim=2) 42 | masks.append(mask) 43 | if masks: 44 | masks = torch.stack(masks, dim=0) 45 | else: 46 | masks = torch.zeros((0, height, width), dtype=torch.uint8) 47 | return masks 48 | 49 | 50 | class ConvertCocoPolysToMask(object): 51 | def __init__(self, return_masks=False): 52 | self.return_masks = return_masks 53 | 54 | def __call__(self, image, target): 55 | w, h = image.size 56 | 57 | image_id = target["image_id"] 58 | image_id = torch.tensor([image_id]) 59 | 60 | anno = target["annotations"] 61 | 62 | anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0] 63 | 64 | boxes = [obj["bbox"] for obj in anno] 65 | # guard against no boxes via resizing 66 | boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) 67 | boxes[:, 2:] += boxes[:, :2] 68 | boxes[:, 0::2].clamp_(min=0, max=w) 69 | boxes[:, 1::2].clamp_(min=0, max=h) 70 | 71 | classes = [obj["category_id"] for obj in anno] 72 | classes = torch.tensor(classes, dtype=torch.int64) 73 | 74 | if self.return_masks: 75 | segmentations = [obj["segmentation"] for obj in anno] 76 | masks = convert_coco_poly_to_mask(segmentations, h, w) 77 | 78 | keypoints = None 79 | if anno and "keypoints" in anno[0]: 80 | keypoints = [obj["keypoints"] for obj in anno] 81 | keypoints = torch.as_tensor(keypoints, dtype=torch.float32) 82 | num_keypoints = keypoints.shape[0] 83 | if num_keypoints: 84 | keypoints = keypoints.view(num_keypoints, -1, 3) 85 | 86 | keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) 87 | boxes = boxes[keep] 88 | classes = classes[keep] 89 | if self.return_masks: 90 | masks = masks[keep] 91 | if keypoints is not None: 92 | keypoints = keypoints[keep] 93 | 94 | target = {} 95 | target["boxes"] = boxes 96 | target["labels"] = classes 97 | if self.return_masks: 98 | target["masks"] = masks 99 | target["image_id"] = image_id 100 | if keypoints is not None: 101 | target["keypoints"] = keypoints 102 | 103 | # for conversion to coco api 104 | area = torch.tensor([obj["area"] for obj in anno]) 105 | iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) 106 | target["area"] = area[keep] 107 | target["iscrowd"] = iscrowd[keep] 108 | 109 | target["orig_size"] = 
torch.as_tensor([int(h), int(w)]) 110 | target["size"] = torch.as_tensor([int(h), int(w)]) 111 | 112 | return image, target 113 | 114 | 115 | def make_coco_transforms(image_set): 116 | 117 | normalize = T.Compose([ 118 | T.ToTensor(), 119 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 120 | ]) 121 | 122 | scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800] 123 | 124 | if image_set == 'train': 125 | return T.Compose([ 126 | T.RandomHorizontalFlip(), 127 | T.RandomSelect( 128 | T.RandomResize(scales, max_size=1333), 129 | T.Compose([ 130 | T.RandomResize([400, 500, 600]), 131 | T.RandomSizeCrop(384, 600), 132 | T.RandomResize(scales, max_size=1333), 133 | ]) 134 | ), 135 | normalize, 136 | ]) 137 | 138 | if image_set == 'val': 139 | return T.Compose([ 140 | T.RandomResize([800], max_size=1333), 141 | normalize, 142 | ]) 143 | 144 | raise ValueError(f'unknown {image_set}') 145 | 146 | 147 | def build(image_set, args): 148 | root = Path(args.coco_path) 149 | assert root.exists(), f'provided COCO path {root} does not exist' 150 | mode = 'instances' 151 | PATHS = { 152 | "train": (root / "train2017", root / "annotations" / f'{mode}_train2017.json'), 153 | "val": (root / "val2017", root / "annotations" / f'{mode}_val2017.json'), 154 | } 155 | img_folder, ann_file = PATHS[image_set] 156 | dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks) 157 | return dataset 158 | -------------------------------------------------------------------------------- /dataset/samplers.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from codes in torch.utils.data.distributed 7 | # ------------------------------------------------------------------------ 8 | 9 | import os 10 | import math 11 | import torch 12 | import torch.distributed as dist 13 | from torch.utils.data.sampler import Sampler 14 | 15 | 16 | class DistributedSampler(Sampler): 17 | """Sampler that restricts data loading to a subset of the dataset. 18 | It is especially useful in conjunction with 19 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 20 | process can pass a DistributedSampler instance as a DataLoader sampler, 21 | and load a subset of the original dataset that is exclusive to it. 22 | .. note:: 23 | Dataset is assumed to be of constant size. 24 | Arguments: 25 | dataset: Dataset used for sampling. 26 | num_replicas (optional): Number of processes participating in 27 | distributed training. 28 | rank (optional): Rank of the current process within num_replicas. 
29 | """ 30 | 31 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True): 32 | if num_replicas is None: 33 | if not dist.is_available(): 34 | raise RuntimeError("Requires distributed package to be available") 35 | num_replicas = dist.get_world_size() 36 | if rank is None: 37 | if not dist.is_available(): 38 | raise RuntimeError("Requires distributed package to be available") 39 | rank = dist.get_rank() 40 | self.dataset = dataset 41 | self.num_replicas = num_replicas 42 | self.rank = rank 43 | self.epoch = 0 44 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 45 | self.total_size = self.num_samples * self.num_replicas 46 | self.shuffle = shuffle 47 | 48 | def __iter__(self): 49 | if self.shuffle: 50 | # deterministically shuffle based on epoch 51 | g = torch.Generator() 52 | g.manual_seed(self.epoch) 53 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 54 | else: 55 | indices = torch.arange(len(self.dataset)).tolist() 56 | 57 | # add extra samples to make it evenly divisible 58 | indices += indices[: (self.total_size - len(indices))] 59 | assert len(indices) == self.total_size 60 | 61 | # subsample 62 | offset = self.num_samples * self.rank 63 | indices = indices[offset : offset + self.num_samples] 64 | assert len(indices) == self.num_samples 65 | 66 | return iter(indices) 67 | 68 | def __len__(self): 69 | return self.num_samples 70 | 71 | def set_epoch(self, epoch): 72 | self.epoch = epoch 73 | 74 | 75 | class NodeDistributedSampler(Sampler): 76 | """Sampler that restricts data loading to a subset of the dataset. 77 | It is especially useful in conjunction with 78 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 79 | process can pass a DistributedSampler instance as a DataLoader sampler, 80 | and load a subset of the original dataset that is exclusive to it. 81 | .. note:: 82 | Dataset is assumed to be of constant size. 83 | Arguments: 84 | dataset: Dataset used for sampling. 85 | num_replicas (optional): Number of processes participating in 86 | distributed training. 87 | rank (optional): Rank of the current process within num_replicas. 
88 | """ 89 | 90 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True): 91 | if num_replicas is None: 92 | if not dist.is_available(): 93 | raise RuntimeError("Requires distributed package to be available") 94 | num_replicas = dist.get_world_size() 95 | if rank is None: 96 | if not dist.is_available(): 97 | raise RuntimeError("Requires distributed package to be available") 98 | rank = dist.get_rank() 99 | if local_rank is None: 100 | local_rank = int(os.environ.get('LOCAL_RANK', 0)) 101 | if local_size is None: 102 | local_size = int(os.environ.get('LOCAL_SIZE', 1)) 103 | self.dataset = dataset 104 | self.shuffle = shuffle 105 | self.num_replicas = num_replicas 106 | self.num_parts = local_size 107 | self.rank = rank 108 | self.local_rank = local_rank 109 | self.epoch = 0 110 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 111 | self.total_size = self.num_samples * self.num_replicas 112 | 113 | self.total_size_parts = self.num_samples * self.num_replicas // self.num_parts 114 | 115 | def __iter__(self): 116 | if self.shuffle: 117 | # deterministically shuffle based on epoch 118 | g = torch.Generator() 119 | g.manual_seed(self.epoch) 120 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 121 | else: 122 | indices = torch.arange(len(self.dataset)).tolist() 123 | indices = [i for i in indices if i % self.num_parts == self.local_rank] 124 | 125 | # add extra samples to make it evenly divisible 126 | indices += indices[:(self.total_size_parts - len(indices))] 127 | assert len(indices) == self.total_size_parts 128 | 129 | # subsample 130 | indices = indices[self.rank // self.num_parts:self.total_size_parts:self.num_replicas // self.num_parts] 131 | assert len(indices) == self.num_samples 132 | 133 | return iter(indices) 134 | 135 | def __len__(self): 136 | return self.num_samples 137 | 138 | def set_epoch(self, epoch): 139 | self.epoch = epoch 140 | -------------------------------------------------------------------------------- /mask2former/modeling/meta_arch/mask_former_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import logging 3 | from copy import deepcopy 4 | from typing import Callable, Dict, List, Optional, Tuple, Union 5 | 6 | import fvcore.nn.weight_init as weight_init 7 | from torch import nn 8 | from torch.nn import functional as F 9 | 10 | from detectron2.config import configurable 11 | from detectron2.layers import Conv2d, ShapeSpec, get_norm 12 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY 13 | 14 | from ..transformer_decoder.maskformer_transformer_decoder import build_transformer_decoder 15 | from ..pixel_decoder.fpn import build_pixel_decoder 16 | 17 | 18 | @SEM_SEG_HEADS_REGISTRY.register() 19 | class MaskFormerHead(nn.Module): 20 | 21 | _version = 2 22 | 23 | def _load_from_state_dict( 24 | self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs 25 | ): 26 | version = local_metadata.get("version", None) 27 | if version is None or version < 2: 28 | # Do not warn if train from scratch 29 | scratch = True 30 | logger = logging.getLogger(__name__) 31 | for k in list(state_dict.keys()): 32 | newk = k 33 | if "sem_seg_head" in k and not k.startswith(prefix + "predictor"): 34 | newk = k.replace(prefix, prefix + "pixel_decoder.") 35 | # logger.debug(f"{k} ==> {newk}") 36 | if newk != k: 37 | state_dict[newk] = state_dict[k] 38 | del state_dict[k] 39 | scratch = False 40 | 41 | if not scratch: 42 | logger.warning( 43 | f"Weight format of {self.__class__.__name__} have changed! " 44 | "Please upgrade your models. Applying automatic conversion now ..." 45 | ) 46 | 47 | @configurable 48 | def __init__( 49 | self, 50 | input_shape: Dict[str, ShapeSpec], 51 | *, 52 | num_classes: int, 53 | pixel_decoder: nn.Module, 54 | loss_weight: float = 1.0, 55 | ignore_value: int = -1, 56 | # extra parameters 57 | transformer_predictor: nn.Module, 58 | transformer_in_feature: str, 59 | ): 60 | """ 61 | NOTE: this interface is experimental. 62 | Args: 63 | input_shape: shapes (channels and stride) of the input features 64 | num_classes: number of classes to predict 65 | pixel_decoder: the pixel decoder module 66 | loss_weight: loss weight 67 | ignore_value: category id to be ignored during training. 
68 | transformer_predictor: the transformer decoder that makes prediction 69 | transformer_in_feature: input feature name to the transformer_predictor 70 | """ 71 | super().__init__() 72 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) 73 | self.in_features = [k for k, v in input_shape] 74 | feature_strides = [v.stride for k, v in input_shape] 75 | feature_channels = [v.channels for k, v in input_shape] 76 | 77 | self.ignore_value = ignore_value 78 | self.common_stride = 4 79 | self.loss_weight = loss_weight 80 | 81 | self.pixel_decoder = pixel_decoder 82 | self.predictor = transformer_predictor 83 | self.transformer_in_feature = transformer_in_feature 84 | 85 | self.num_classes = num_classes 86 | 87 | @classmethod 88 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): 89 | # figure out in_channels to transformer predictor 90 | if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder": 91 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 92 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding": 93 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 94 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder": # for maskformer2 95 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 96 | else: 97 | transformer_predictor_in_channels = input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels 98 | 99 | return { 100 | "input_shape": { 101 | k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES 102 | }, 103 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 104 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 105 | "pixel_decoder": build_pixel_decoder(cfg, input_shape), 106 | "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, 107 | "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE, 108 | "transformer_predictor": build_transformer_decoder( 109 | cfg, 110 | transformer_predictor_in_channels, 111 | mask_classification=True, 112 | ), 113 | } 114 | 115 | def forward(self, features, mask=None): 116 | return self.layers(features, mask) 117 | 118 | def layers(self, features, mask=None): 119 | mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(features) 120 | if self.transformer_in_feature == "multi_scale_pixel_decoder": 121 | predictions = self.predictor(multi_scale_features, mask_features, mask) 122 | else: 123 | if self.transformer_in_feature == "transformer_encoder": 124 | assert ( 125 | transformer_encoder_features is not None 126 | ), "Please use the TransformerEncoderPixelDecoder." 127 | predictions = self.predictor(transformer_encoder_features, mask_features, mask) 128 | elif self.transformer_in_feature == "pixel_embedding": 129 | predictions = self.predictor(mask_features, mask_features, mask) 130 | else: 131 | predictions = self.predictor(features[self.transformer_in_feature], mask_features, mask) 132 | return predictions 133 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import warnings 17 | import math 18 | 19 | import torch 20 | from torch import nn 21 | import torch.nn.functional as F 22 | from torch.nn.init import xavier_uniform_, constant_ 23 | 24 | from ..functions import MSDeformAttnFunction 25 | from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch 26 | 27 | 28 | def _is_power_of_2(n): 29 | if (not isinstance(n, int)) or (n < 0): 30 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 31 | return (n & (n-1) == 0) and n != 0 32 | 33 | 34 | class MSDeformAttn(nn.Module): 35 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): 36 | """ 37 | Multi-Scale Deformable Attention Module 38 | :param d_model hidden dimension 39 | :param n_levels number of feature levels 40 | :param n_heads number of attention heads 41 | :param n_points number of sampling points per attention head per feature level 42 | """ 43 | super().__init__() 44 | if d_model % n_heads != 0: 45 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 46 | _d_per_head = d_model // n_heads 47 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 48 | if not _is_power_of_2(_d_per_head): 49 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 50 | "which is more efficient in our CUDA implementation.") 51 | 52 | self.im2col_step = 128 53 | 54 | self.d_model = d_model 55 | self.n_levels = n_levels 56 | self.n_heads = n_heads 57 | self.n_points = n_points 58 | 59 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 60 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 61 | self.value_proj = nn.Linear(d_model, d_model) 62 | self.output_proj = nn.Linear(d_model, d_model) 63 | 64 | self._reset_parameters() 65 | 66 | def _reset_parameters(self): 67 | constant_(self.sampling_offsets.weight.data, 0.) 68 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 69 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 70 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 71 | for i in range(self.n_points): 72 | grid_init[:, :, i, :] *= i + 1 73 | with torch.no_grad(): 74 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 75 | constant_(self.attention_weights.weight.data, 0.) 76 | constant_(self.attention_weights.bias.data, 0.) 77 | xavier_uniform_(self.value_proj.weight.data) 78 | constant_(self.value_proj.bias.data, 0.) 79 | xavier_uniform_(self.output_proj.weight.data) 80 | constant_(self.output_proj.bias.data, 0.) 
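    # On the initialization above: `sampling_offsets.weight` is zeroed, so the initial
    # offsets come entirely from its bias. That bias places the k-th sampling point of each
    # head (k+1) unit steps along a head-specific direction (the `thetas` angles are spread
    # evenly around the circle), while `attention_weights` starts at zero, which after the
    # softmax in `forward` corresponds to uniform attention over levels and points.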
81 | 82 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 83 | """ 84 | :param query (N, Length_{query}, C) 85 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 86 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 87 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 88 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 89 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 90 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 91 | 92 | :return output (N, Length_{query}, C) 93 | """ 94 | N, Len_q, _ = query.shape 95 | N, Len_in, _ = input_flatten.shape 96 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 97 | 98 | value = self.value_proj(input_flatten) 99 | if input_padding_mask is not None: 100 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 101 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 102 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 103 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 104 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 105 | # N, Len_q, n_heads, n_levels, n_points, 2 106 | if reference_points.shape[-1] == 2: 107 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 108 | sampling_locations = reference_points[:, :, None, :, None, :] \ 109 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] 110 | elif reference_points.shape[-1] == 4: 111 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 112 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 113 | else: 114 | raise ValueError( 115 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 116 | try: 117 | output = MSDeformAttnFunction.apply( 118 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 119 | except: 120 | # CPU 121 | output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 122 | # # For FLOPs calculation only 123 | # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 124 | output = self.output_proj(output) 125 | return output 126 | -------------------------------------------------------------------------------- /dataset/jhmdb.py: -------------------------------------------------------------------------------- 1 | """ 2 | JHMDB-Sentences data loader 3 | modified from https://github.com/mttr2021/MTTR/blob/main/datasets/jhmdb_sentences/jhmdb_sentences_dataset.py 4 | """ 5 | from pathlib import Path 6 | 7 | import torch 8 | from torchvision.io import read_video 9 | import torchvision.transforms.functional as F 10 | 11 | from torch.utils.data import Dataset 12 | import dataset.transforms_video as T 13 | 14 | import os 15 | from PIL import Image 16 | import json 17 | import numpy as np 18 | import 
random 19 | 20 | import scipy.io 21 | 22 | def get_image_id(video_id, frame_idx): 23 | image_id = f'v_{video_id}_f_{frame_idx}' 24 | return image_id 25 | 26 | class JHMDBSentencesDataset(Dataset): 27 | """ 28 | A Torch dataset for JHMDB-Sentences. 29 | For more information check out: https://kgavrilyuk.github.io/publication/actor_action/ or the original paper at: 30 | https://arxiv.org/abs/1803.07485 31 | """ 32 | def __init__(self, image_folder: Path, ann_file: Path, transforms, return_masks: bool, 33 | num_frames: int, max_skip: int, subset): 34 | super(JHMDBSentencesDataset, self).__init__() 35 | self.dataset_path = 'data' 36 | self.ann_file = ann_file 37 | self.samples_metadata = self.get_samples_metadata() 38 | 39 | self._transforms = transforms 40 | self.return_masks = return_masks # not used 41 | self.num_frames = num_frames 42 | self.max_skip = max_skip 43 | self.subset = subset 44 | 45 | print(f'\n {subset} sample num: ', len(self.samples_metadata)) 46 | print('\n') 47 | 48 | def get_samples_metadata(self): 49 | with open(str(self.ann_file), 'r') as f: 50 | samples_metadata = [tuple(a) for a in json.load(f)] 51 | return samples_metadata 52 | 53 | @staticmethod 54 | def bounding_box(img): 55 | rows = np.any(img, axis=1) 56 | cols = np.any(img, axis=0) 57 | rmin, rmax = np.where(rows)[0][[0, -1]] 58 | cmin, cmax = np.where(cols)[0][[0, -1]] 59 | return rmin, rmax, cmin, cmax # y1, y2, x1, x2 60 | 61 | def __len__(self): 62 | return len(self.samples_metadata) 63 | 64 | def __getitem__(self, idx): 65 | # only support for evaluation 66 | video_id, chosen_frame_path, video_masks_path, video_total_frames, text_query = self.samples_metadata[idx] 67 | text_query = " ".join(text_query.lower().split()) # clean up the text query 68 | 69 | # read the source window frames: 70 | chosen_frame_idx = int(chosen_frame_path.split('/')[-1].split('.')[0]) 71 | # get a window of window_size frames with frame chosen_frame_idx in the middle. 
72 | start_idx, end_idx = chosen_frame_idx - self.num_frames // 2, chosen_frame_idx + (self.num_frames + 1) // 2 73 | frame_indices = list(range(start_idx, end_idx)) # note that jhmdb-sentences frames are 1-indexed 74 | # extract the window source frames: 75 | sample_indx = [] 76 | for i in frame_indices: 77 | i = min(max(i, 1), video_total_frames) # pad out of range indices with edge frames 78 | sample_indx.append(i) 79 | sample_indx.sort() 80 | # find the valid frame index in sampled frame list, there is only one valid frame 81 | valid_indices = sample_indx.index(chosen_frame_idx) 82 | 83 | # read frames 84 | imgs, boxes, masks, valid = [], [], [], [] 85 | for i in sample_indx: 86 | p = '/'.join(chosen_frame_path.split('/')[:-1]) + f'/{i:05d}.png' 87 | frame_path = os.path.join(self.dataset_path, p) 88 | imgs.append(Image.open(frame_path).convert('RGB')) 89 | 90 | # read the instance masks: 91 | video_masks_path = os.path.join(self.dataset_path, video_masks_path) 92 | all_video_masks = scipy.io.loadmat(video_masks_path)['part_mask'].transpose(2, 0, 1) # [T, H, W] 93 | # note that to take the center-frame corresponding mask we switch to 0-indexing: 94 | instance_mask = torch.tensor(all_video_masks[chosen_frame_idx - 1]) # [H, W] 95 | mask = instance_mask.numpy() 96 | if (mask > 0).any(): 97 | y1, y2, x1, x2 = self.bounding_box(mask) 98 | box = torch.tensor([x1, y1, x2, y2]).to(torch.float) 99 | valid.append(1) 100 | else: # some frame didn't contain the instance 101 | box = torch.tensor([0, 0, 0, 0]).to(torch.float) 102 | valid.append(0) 103 | mask = torch.from_numpy(mask) 104 | boxes.append(box) 105 | masks.append(mask) 106 | 107 | # transform 108 | h, w = instance_mask.shape[-2:] 109 | boxes = torch.stack(boxes, dim=0) 110 | boxes[:, 0::2].clamp_(min=0, max=w) 111 | boxes[:, 1::2].clamp_(min=0, max=h) 112 | masks = torch.stack(masks, dim=0) 113 | # there is only one valid frame 114 | target = { 115 | 'frames_idx': torch.tensor(sample_indx), # [T,] 116 | 'valid_indices': torch.tensor([valid_indices]), 117 | 'boxes': boxes, # [1, 4], xyxy 118 | 'masks': masks, # [1, H, W] 119 | 'valid': torch.tensor(valid), # [1,] 120 | 'caption': text_query, 121 | 'orig_size': torch.as_tensor([int(h), int(w)]), 122 | 'size': torch.as_tensor([int(h), int(w)]), 123 | 'image_id': get_image_id(video_id, chosen_frame_idx) 124 | } 125 | 126 | # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform 127 | imgs, target = self._transforms(imgs, target) 128 | imgs = torch.stack(imgs, dim=0) # [T, 3, H, W] 129 | 130 | # in 'val', valid always satisfies 131 | return imgs, target 132 | 133 | 134 | def make_coco_transforms(image_set, max_size=640): 135 | normalize = T.Compose([ 136 | T.ToTensor(), 137 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 138 | ]) 139 | 140 | scales = [288, 320, 352, 392, 416, 448, 480, 512] 141 | 142 | if image_set == 'train': 143 | return T.Compose([ 144 | T.RandomHorizontalFlip(), 145 | T.PhotometricDistort(), 146 | T.RandomSelect( 147 | T.Compose([ 148 | T.RandomResize(scales, max_size=max_size), 149 | T.Check(), 150 | ]), 151 | T.Compose([ 152 | T.RandomResize([400, 500, 600]), 153 | T.RandomSizeCrop(384, 600), 154 | T.RandomResize(scales, max_size=max_size), 155 | T.Check(), 156 | ]) 157 | ), 158 | normalize, 159 | ]) 160 | 161 | # we do not use the 'val' set since the annotations are inaccessible 162 | if image_set == 'val': 163 | return T.Compose([ 164 | T.RandomResize([360], max_size=640), 165 | normalize, 166 | ]) 167 | 168 | raise 
ValueError(f'unknown {image_set}') 169 | 170 | 171 | def build(image_set, args): 172 | root = Path(args.jhmdb_path) 173 | assert root.exists(), f'provided JHMDB-Sentences path {root} does not exist' 174 | PATHS = { 175 | "train": (root, root / "jhmdb_sentences_samples_metadata.json"), # not used 176 | "val": (root, root / "jhmdb_sentences_samples_metadata.json"), 177 | } 178 | img_folder, ann_file = PATHS[image_set] 179 | dataset = JHMDBSentencesDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), 180 | return_masks=args.masks, num_frames=args.num_frames, max_skip=args.max_skip, subset=image_set) 181 | return dataset -------------------------------------------------------------------------------- /mask2former/modeling/transformer_decoder/maskformer_transformer_decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py 3 | import fvcore.nn.weight_init as weight_init 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | from detectron2.config import configurable 9 | from detectron2.layers import Conv2d 10 | from detectron2.utils.registry import Registry 11 | 12 | from .position_encoding import PositionEmbeddingSine 13 | from .transformer import Transformer 14 | 15 | 16 | TRANSFORMER_DECODER_REGISTRY = Registry("TRANSFORMER_MODULE") 17 | TRANSFORMER_DECODER_REGISTRY.__doc__ = """ 18 | Registry for transformer module in MaskFormer. 19 | """ 20 | 21 | 22 | def build_transformer_decoder(cfg, in_channels, mask_classification=True): 23 | """ 24 | Build a instance embedding branch from `cfg.MODEL.INS_EMBED_HEAD.NAME`. 25 | """ 26 | name = cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME 27 | return TRANSFORMER_DECODER_REGISTRY.get(name)(cfg, in_channels, mask_classification) 28 | 29 | 30 | @TRANSFORMER_DECODER_REGISTRY.register() 31 | class StandardTransformerDecoder(nn.Module): 32 | @configurable 33 | def __init__( 34 | self, 35 | in_channels, 36 | mask_classification=True, 37 | *, 38 | num_classes: int, 39 | hidden_dim: int, 40 | num_queries: int, 41 | nheads: int, 42 | dropout: float, 43 | dim_feedforward: int, 44 | enc_layers: int, 45 | dec_layers: int, 46 | pre_norm: bool, 47 | deep_supervision: bool, 48 | mask_dim: int, 49 | enforce_input_project: bool, 50 | ): 51 | """ 52 | NOTE: this interface is experimental. 
53 | Args: 54 | in_channels: channels of the input features 55 | mask_classification: whether to add mask classifier or not 56 | num_classes: number of classes 57 | hidden_dim: Transformer feature dimension 58 | num_queries: number of queries 59 | nheads: number of heads 60 | dropout: dropout in Transformer 61 | dim_feedforward: feature dimension in feedforward network 62 | enc_layers: number of Transformer encoder layers 63 | dec_layers: number of Transformer decoder layers 64 | pre_norm: whether to use pre-LayerNorm or not 65 | deep_supervision: whether to add supervision to every decoder layers 66 | mask_dim: mask feature dimension 67 | enforce_input_project: add input project 1x1 conv even if input 68 | channels and hidden dim is identical 69 | """ 70 | super().__init__() 71 | 72 | self.mask_classification = mask_classification 73 | 74 | # positional encoding 75 | N_steps = hidden_dim // 2 76 | self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) 77 | 78 | transformer = Transformer( 79 | d_model=hidden_dim, 80 | dropout=dropout, 81 | nhead=nheads, 82 | dim_feedforward=dim_feedforward, 83 | num_encoder_layers=enc_layers, 84 | num_decoder_layers=dec_layers, 85 | normalize_before=pre_norm, 86 | return_intermediate_dec=deep_supervision, 87 | ) 88 | 89 | self.num_queries = num_queries 90 | self.transformer = transformer 91 | hidden_dim = transformer.d_model 92 | 93 | self.query_embed = nn.Embedding(num_queries, hidden_dim) 94 | 95 | if in_channels != hidden_dim or enforce_input_project: 96 | self.input_proj = Conv2d(in_channels, hidden_dim, kernel_size=1) 97 | weight_init.c2_xavier_fill(self.input_proj) 98 | else: 99 | self.input_proj = nn.Sequential() 100 | self.aux_loss = deep_supervision 101 | 102 | # output FFNs 103 | if self.mask_classification: 104 | self.class_embed = nn.Linear(hidden_dim, num_classes + 1) 105 | self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3) 106 | 107 | @classmethod 108 | def from_config(cls, cfg, in_channels, mask_classification): 109 | ret = {} 110 | ret["in_channels"] = in_channels 111 | ret["mask_classification"] = mask_classification 112 | 113 | ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES 114 | ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM 115 | ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES 116 | # Transformer parameters: 117 | ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS 118 | ret["dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT 119 | ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD 120 | ret["enc_layers"] = cfg.MODEL.MASK_FORMER.ENC_LAYERS 121 | ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS 122 | ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM 123 | ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION 124 | ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ 125 | 126 | ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 127 | 128 | return ret 129 | 130 | def forward(self, x, mask_features, mask=None): 131 | if mask is not None: 132 | mask = F.interpolate(mask[None].float(), size=x.shape[-2:]).to(torch.bool)[0] 133 | pos = self.pe_layer(x, mask) 134 | 135 | src = x 136 | hs, memory = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos) 137 | 138 | if self.mask_classification: 139 | outputs_class = self.class_embed(hs) 140 | out = {"pred_logits": outputs_class[-1]} 141 | else: 142 | out = {} 143 | 144 | if self.aux_loss: 145 | # [l, bs, queries, embed] 146 | mask_embed = self.mask_embed(hs) 147 | outputs_seg_masks = 
torch.einsum("lbqc,bchw->lbqhw", mask_embed, mask_features) 148 | out["pred_masks"] = outputs_seg_masks[-1] 149 | out["aux_outputs"] = self._set_aux_loss( 150 | outputs_class if self.mask_classification else None, outputs_seg_masks 151 | ) 152 | else: 153 | # FIXME h_boxes takes the last one computed, keep this in mind 154 | # [bs, queries, embed] 155 | mask_embed = self.mask_embed(hs[-1]) 156 | outputs_seg_masks = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features) 157 | out["pred_masks"] = outputs_seg_masks 158 | return out 159 | 160 | @torch.jit.unused 161 | def _set_aux_loss(self, outputs_class, outputs_seg_masks): 162 | # this is a workaround to make torchscript happy, as torchscript 163 | # doesn't support dictionary with non-homogeneous values, such 164 | # as a dict having both a Tensor and a list. 165 | if self.mask_classification: 166 | return [ 167 | {"pred_logits": a, "pred_masks": b} 168 | for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1]) 169 | ] 170 | else: 171 | return [{"pred_masks": b} for b in outputs_seg_masks[:-1]] 172 | 173 | 174 | class MLP(nn.Module): 175 | """Very simple multi-layer perceptron (also called FFN)""" 176 | 177 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 178 | super().__init__() 179 | self.num_layers = num_layers 180 | h = [hidden_dim] * (num_layers - 1) 181 | self.layers = nn.ModuleList( 182 | nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) 183 | ) 184 | 185 | def forward(self, x): 186 | for i, layer in enumerate(self.layers): 187 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 188 | return x 189 | --------------------------------------------------------------------------------
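The mask prediction step at the heart of `StandardTransformerDecoder.forward` is the `torch.einsum("bqc,bchw->bqhw", ...)` contraction: each query's mask embedding is dotted with every per-pixel mask feature to produce that query's mask logits. The self-contained sketch below illustrates just that step with made-up tensor sizes that do not correspond to any particular config:

```python
import torch

# Toy sizes for illustration only: batch of 2, 100 queries, 256-dim embeddings,
# and a 32x32 mask-feature map.
bs, q, c, h, w = 2, 100, 256, 32, 32

mask_embed = torch.randn(bs, q, c)         # per-query mask embeddings from the MLP head, [B, Q, C]
mask_features = torch.randn(bs, c, h, w)   # per-pixel features from the pixel decoder, [B, C, H, W]

# Each query's mask logit at a pixel is the dot product between its embedding and
# that pixel's feature vector, the same "bqc,bchw->bqhw" contraction used in the decoder.
pred_masks = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features)
print(pred_masks.shape)  # torch.Size([2, 100, 32, 32])
```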