├── .DS_Store ├── .gitignore ├── INSTALL.md ├── LICENSE ├── README.md ├── configs ├── ade20k │ └── semantic-segmentation │ │ ├── Base-ADE20K-SemanticSegmentation.yaml │ │ └── maskdino_R50_bs16_160k_steplr.yaml ├── cityscapes │ └── semantic-segmentation │ │ ├── Base-Cityscapes-SemanticSegmentation.yaml │ │ └── maskdino_R50_bs16_90k_steplr.yaml └── coco │ ├── instance-segmentation │ ├── Base-COCO-InstanceSegmentation.yaml │ ├── maskdino_R50_bs16_50ep_3s.yaml │ ├── maskdino_R50_bs16_50ep_3s_dowsample1_2048.yaml │ ├── maskdino_R50_bs16_50ep_3s_dowsample1_2048_bitmask.yaml │ ├── maskdino_R50_bs16_50ep_3s_dowsample2.yaml │ ├── maskdino_R50_bs16_50ep_4s_dowsample1_2048.yaml │ └── swin │ │ ├── maskdino_R50_bs16_50ep_4s_dowsample1_1024.yaml │ │ ├── maskdino_R50_bs16_50ep_4s_dowsample1_2048.yaml │ │ └── maskdino_R50_bs16_50ep_4s_dowsample1_2048_no_maskEnhance.yaml │ └── panoptic-segmentation │ ├── Base-COCO-PanopticSegmentation.yaml │ ├── maskdino_R50_bs16_50ep_3s_dowsample1_2048.yaml │ └── swin │ ├── maskdino_R50_bs16_50ep_4s_dowsample1_2048.yaml │ ├── maskdino_R50_bs16_50ep_4s_dowsample1_large_eval.yaml │ └── maskdino_R50_bs16_50ep_4s_dowsample1_maskEnhance_2048.yaml ├── datasets ├── README.md ├── ade20k_instance_catid_mapping.txt ├── ade20k_instance_imgCatIds.json ├── prepare_ade20k_ins_seg.py ├── prepare_ade20k_pan_seg.py ├── prepare_ade20k_sem_seg.py └── prepare_coco_semantic_annos_from_panoptic_annos.py ├── demo ├── README.md ├── demo.py └── predictor.py ├── figures ├── dinosaur.png ├── framework.jpg ├── instance.png ├── panoptic.png ├── semantic.png └── sota.png ├── maskdino ├── .DS_Store ├── __init__.py ├── config.py ├── data │ ├── __init__.py │ ├── dataset_mappers │ │ ├── __init__.py │ │ ├── coco_instance_new_baseline_dataset_mapper.py │ │ ├── coco_panoptic_new_baseline_dataset_mapper.py │ │ ├── detr_dataset_mapper.py │ │ └── mask_former_semantic_dataset_mapper.py │ └── datasets │ │ ├── __init__.py │ │ ├── register_ade20k_full.py │ │ ├── register_ade20k_instance.py │ │ ├── register_ade20k_panoptic.py │ │ ├── register_coco_panoptic_annos_semseg.py │ │ ├── register_coco_stuff_10k.py │ │ ├── register_mapillary_vistas.py │ │ └── register_mapillary_vistas_panoptic.py ├── evaluation │ ├── __init__.py │ └── instance_evaluation.py ├── maskdino.py ├── modeling │ ├── __init__.py │ ├── backbone │ │ ├── __init__.py │ │ ├── focal.py │ │ └── swin.py │ ├── criterion.py │ ├── matcher.py │ ├── meta_arch │ │ ├── __init__.py │ │ └── maskdino_head.py │ ├── pixel_decoder │ │ ├── __init__.py │ │ ├── maskdino_encoder.py │ │ ├── ops │ │ │ ├── functions │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn_func.py │ │ │ ├── make.sh │ │ │ ├── modules │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn.py │ │ │ ├── setup.py │ │ │ ├── src │ │ │ │ ├── cpu │ │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ │ └── ms_deform_attn_cpu.h │ │ │ │ ├── cuda │ │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ │ ├── ms_deform_attn.h │ │ │ │ └── vision.cpp │ │ │ └── test.py │ │ └── position_encoding.py │ └── transformer_decoder │ │ ├── __init__.py │ │ ├── dino_decoder.py │ │ └── maskdino_decoder.py ├── test_time_augmentation.py └── utils │ ├── __init__.py │ ├── box_ops.py │ ├── misc.py │ └── utils.py ├── requirements.txt ├── tools ├── README.md ├── analyze_model.py ├── convert-pretrained-swin-model-to-d2.py ├── convert-torchvision-to-d2.py ├── evaluate_coco_boundary_ap.py └── evaluate_pq_for_semantic_segmentation.py └── train_net.py /.DS_Store: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-Research/MaskDINO/3831d8514a3728535ace8d4ecc7d28044c42dd14/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # output dir 2 | output 3 | instant_test_output 4 | inference_test_output 5 | 6 | 7 | *.png 8 | *.json 9 | *.diff 10 | *.jpg 11 | !/projects/DensePose/doc/images/*.jpg 12 | 13 | # compilation and distribution 14 | __pycache__ 15 | _ext 16 | *.pyc 17 | *.pyd 18 | *.so 19 | *.dll 20 | *.egg-info/ 21 | build/ 22 | dist/ 23 | wheels/ 24 | 25 | # pytorch/python/numpy formats 26 | *.pth 27 | *.pkl 28 | *.npy 29 | *.ts 30 | model_ts*.txt 31 | 32 | # ipython/jupyter notebooks 33 | *.ipynb 34 | **/.ipynb_checkpoints/ 35 | 36 | # Editor temporaries 37 | *.swn 38 | *.swo 39 | *.swp 40 | *~ 41 | 42 | # editor settings 43 | .idea 44 | .vscode 45 | _darcs 46 | 47 | # project dirs 48 | /detectron2/model_zoo/configs 49 | /datasets/* 50 | !/datasets/*.* 51 | /projects/*/datasets 52 | /models 53 | /snippet 54 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | ### Requirements 4 | - Linux with Python ≥ 3.6 5 | - PyTorch ≥ 1.9 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation. 6 | Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note: please check that 7 | the PyTorch version matches the one required by Detectron2. 8 | - Detectron2: follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html). 9 | - OpenCV is optional but needed for the demo and visualization 10 | - `pip install -r requirements.txt` 11 | 12 | ### CUDA kernel for MSDeformAttn 13 | After preparing the required environment, run the following commands to compile the CUDA kernel for MSDeformAttn: 14 | 15 | `CUDA_HOME` must be defined and point to the directory of the installed CUDA toolkit. 16 | 17 | ```bash 18 | cd maskdino/modeling/pixel_decoder/ops 19 | sh make.sh 20 | ``` 21 | 22 | #### Building on another system 23 | To build on a system that does not have a GPU device but provides the drivers: 24 | ```bash 25 | TORCH_CUDA_ARCH_LIST='8.0' FORCE_CUDA=1 python setup.py build install 26 | ``` 27 | 28 | ### Example conda environment setup 29 | ```bash 30 | conda create --name maskdino python=3.8 -y 31 | conda activate maskdino 32 | conda install pytorch==1.9.0 torchvision==0.10.0 cudatoolkit=11.1 -c pytorch -c nvidia 33 | pip install -U opencv-python 34 | 35 | # under your working directory 36 | git clone git@github.com:facebookresearch/detectron2.git 37 | cd detectron2 38 | pip install -e . 39 | pip install git+https://github.com/cocodataset/panopticapi.git 40 | pip install git+https://github.com/mcordts/cityscapesScripts.git 41 | 42 | cd ..
43 | git clone git@github.com:facebookresearch/MaskDINO.git 44 | cd MaskDINO 45 | pip install -r requirements.txt 46 | cd maskdino/modeling/pixel_decoder/ops 47 | sh make.sh 48 | ``` 49 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/Base-ADE20K-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_sem_seg_train",) 18 | TEST: ("ade20k_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 512 40 | MAX_SIZE_TRAIN: 2048 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 512) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 512 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [256, 384, 512, 640, 768, 896] 56 | MAX_SIZE: 3584 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/maskdino_R50_bs16_160k_steplr.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskDINOHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 14 | DIM_FEEDFORWARD: 1024 15 | NUM_FEATURE_LEVELS: 3 16 | TOTAL_NUM_FEATURE_LEVELS: 3 17 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 18 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 19 | COMMON_STRIDE: 4 20 | TRANSFORMER_ENC_LAYERS: 6 21 | MaskDINO: 22 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 23 | DEEP_SUPERVISION: True 24 | NO_OBJECT_WEIGHT: 0.1 25 | CLASS_WEIGHT: 4.0 26 | MASK_WEIGHT: 5.0 27 | DICE_WEIGHT: 5.0 28 | HIDDEN_DIM: 256 29 | NUM_OBJECT_QUERIES: 100 30 | NHEADS: 8 31 | DROPOUT: 0.0 32 | DIM_FEEDFORWARD: 2048 33 | ENC_LAYERS: 0 34 | PRE_NORM: False 35 | ENFORCE_INPUT_PROJ: False 36 | SIZE_DIVISIBILITY: 32 37 | DEC_LAYERS: 9 # 9 decoder layers, add one for the loss on learnable query 38 | TRAIN_NUM_POINTS: 12544 39 | OVERSAMPLE_RATIO: 3.0 40 | IMPORTANCE_SAMPLE_RATIO: 0.75 41 | TWO_STAGE: False 42 | DN: "seg" 43 | DN_NUM: 100 44 | INITIALIZE_BOX_TYPE: "no" 45 | 
SEMANTIC_CE_LOSS: True 46 | TEST: 47 | SEMANTIC_ON: True 48 | INSTANCE_ON: False 49 | PANOPTIC_ON: False 50 | OVERLAP_THRESHOLD: 0.8 51 | OBJECT_MASK_THRESHOLD: 0.8 52 | SOLVER: 53 | AMP: 54 | ENABLED: False 55 | BACKBONE_MULTIPLIER: 0.1 56 | BASE_LR: 0.0001 57 | BIAS_LR_FACTOR: 1.0 58 | CHECKPOINT_PERIOD: 5000 59 | 60 | IMS_PER_BATCH: 16 61 | LR_SCHEDULER_NAME: WarmupMultiStepLR 62 | MAX_ITER: 160000 63 | 64 | STEPS: (135000,150000) 65 | WARMUP_FACTOR: 1.0 66 | WARMUP_ITERS: 10 67 | WARMUP_METHOD: linear 68 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/Base-Cityscapes-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_sem_seg_train",) 18 | TEST: ("cityscapes_fine_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/maskdino_R50_bs16_90k_steplr.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskDINOHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 14 | DIM_FEEDFORWARD: 1024 15 | NUM_FEATURE_LEVELS: 3 16 | TOTAL_NUM_FEATURE_LEVELS: 3 17 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 18 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 19 | COMMON_STRIDE: 4 20 | TRANSFORMER_ENC_LAYERS: 6 21 | MaskDINO: 22 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 23 | DEEP_SUPERVISION: True 24 | NO_OBJECT_WEIGHT: 0.1 25 | CLASS_WEIGHT: 4.0 26 | MASK_WEIGHT: 5.0 27 | DICE_WEIGHT: 5.0 28 | HIDDEN_DIM: 256 29 | NUM_OBJECT_QUERIES: 100 30 | NHEADS: 8 31 | DROPOUT: 0.0 32 | 
DIM_FEEDFORWARD: 2048 33 | ENC_LAYERS: 0 34 | PRE_NORM: False 35 | ENFORCE_INPUT_PROJ: False 36 | SIZE_DIVISIBILITY: 32 37 | DEC_LAYERS: 9 # 9 decoder layers, add one for the loss on learnable query 38 | TRAIN_NUM_POINTS: 12544 39 | OVERSAMPLE_RATIO: 3.0 40 | IMPORTANCE_SAMPLE_RATIO: 0.75 41 | TWO_STAGE: False 42 | DN: "seg" 43 | DN_NUM: 100 44 | INITIALIZE_BOX_TYPE: "no" 45 | SEMANTIC_CE_LOSS: True 46 | TEST: 47 | SEMANTIC_ON: True 48 | INSTANCE_ON: False 49 | PANOPTIC_ON: False 50 | OVERLAP_THRESHOLD: 0.8 51 | OBJECT_MASK_THRESHOLD: 0.8 52 | SOLVER: 53 | AMP: 54 | ENABLED: False 55 | BACKBONE_MULTIPLIER: 0.1 56 | BASE_LR: 0.0001 57 | BIAS_LR_FACTOR: 1.0 58 | CHECKPOINT_PERIOD: 5000 59 | 60 | IMS_PER_BATCH: 16 61 | LR_SCHEDULER_NAME: WarmupMultiStepLR 62 | MAX_ITER: 90000 63 | 64 | STEPS: (80000,87000) 65 | WARMUP_FACTOR: 1.0 66 | WARMUP_ITERS: 10 67 | WARMUP_METHOD: linear 68 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/Base-COCO-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train",) 18 | TEST: ("coco_2017_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_instance_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | # EVAL_FLAG: 1 45 | DATALOADER: 46 | FILTER_EMPTY_ANNOTATIONS: True 47 | NUM_WORKERS: 4 48 | VERSION: 2 49 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/maskdino_R50_bs16_50ep_3s.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskDINOHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 80 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 14 | DIM_FEEDFORWARD: 1024 15 | NUM_FEATURE_LEVELS: 3 16 | TOTAL_NUM_FEATURE_LEVELS: 3 17 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 18 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 19 | COMMON_STRIDE: 4 20 | TRANSFORMER_ENC_LAYERS: 6 21 | MaskDINO: 22 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 23 | DEEP_SUPERVISION: True 24 | NO_OBJECT_WEIGHT: 0.1 25 | CLASS_WEIGHT: 4.0 26 | MASK_WEIGHT: 5.0 27 | DICE_WEIGHT: 5.0 28 | BOX_WEIGHT: 5.0 29 | GIOU_WEIGHT: 2.0 30 | HIDDEN_DIM: 256 31 | NUM_OBJECT_QUERIES: 300 32 | NHEADS: 8 33 | DROPOUT: 0.0 34 | DIM_FEEDFORWARD: 2048 35 | ENC_LAYERS: 0 36 | PRE_NORM: False 37 | ENFORCE_INPUT_PROJ: False 38 | 
SIZE_DIVISIBILITY: 32 39 | DEC_LAYERS: 9 # 9+1, 9 decoder layers, add one for the loss on learnable query 40 | TRAIN_NUM_POINTS: 12544 41 | OVERSAMPLE_RATIO: 3.0 42 | IMPORTANCE_SAMPLE_RATIO: 0.75 43 | INITIAL_PRED: True 44 | TWO_STAGE: True 45 | DN: "seg" 46 | DN_NUM: 100 47 | INITIALIZE_BOX_TYPE: "bitmask" 48 | TEST: 49 | SEMANTIC_ON: False 50 | INSTANCE_ON: True 51 | PANOPTIC_ON: False 52 | OVERLAP_THRESHOLD: 0.8 53 | OBJECT_MASK_THRESHOLD: 0.25 54 | 55 | SOLVER: 56 | AMP: 57 | ENABLED: True 58 | TEST: 59 | EVAL_PERIOD: 5000 60 | # EVAL_FLAG: 1 -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/maskdino_R50_bs16_50ep_3s_dowsample1_2048.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskDINOHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 80 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 14 | DIM_FEEDFORWARD: 2048 15 | NUM_FEATURE_LEVELS: 3 16 | TOTAL_NUM_FEATURE_LEVELS: 4 17 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 18 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 19 | COMMON_STRIDE: 4 20 | TRANSFORMER_ENC_LAYERS: 6 21 | FEATURE_ORDER: "low2high" 22 | MaskDINO: 23 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 24 | DEEP_SUPERVISION: True 25 | NO_OBJECT_WEIGHT: 0.1 26 | CLASS_WEIGHT: 4.0 27 | MASK_WEIGHT: 5.0 28 | DICE_WEIGHT: 5.0 29 | BOX_WEIGHT: 5.0 30 | GIOU_WEIGHT: 2.0 31 | HIDDEN_DIM: 256 32 | NUM_OBJECT_QUERIES: 300 33 | NHEADS: 8 34 | DROPOUT: 0.0 35 | DIM_FEEDFORWARD: 2048 36 | ENC_LAYERS: 0 37 | PRE_NORM: False 38 | ENFORCE_INPUT_PROJ: False 39 | SIZE_DIVISIBILITY: 32 40 | DEC_LAYERS: 9 # 9+1, 9 decoder layers, add one for the loss on learnable query 41 | TRAIN_NUM_POINTS: 12544 42 | OVERSAMPLE_RATIO: 3.0 43 | IMPORTANCE_SAMPLE_RATIO: 0.75 44 | EVAL_FLAG: 1 45 | INITIAL_PRED: True 46 | TWO_STAGE: True 47 | DN: "seg" 48 | DN_NUM: 100 49 | INITIALIZE_BOX_TYPE: 'mask2box' 50 | TEST: 51 | SEMANTIC_ON: False 52 | INSTANCE_ON: True 53 | PANOPTIC_ON: False 54 | OVERLAP_THRESHOLD: 0.8 55 | OBJECT_MASK_THRESHOLD: 0.25 56 | 57 | SOLVER: 58 | AMP: 59 | ENABLED: True 60 | TEST: 61 | EVAL_PERIOD: 5000 62 | # EVAL_FLAG: 1 -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/maskdino_R50_bs16_50ep_3s_dowsample1_2048_bitmask.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskDINOHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 80 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 14 | DIM_FEEDFORWARD: 2048 15 | NUM_FEATURE_LEVELS: 3 16 | TOTAL_NUM_FEATURE_LEVELS: 4 17 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 18 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 19 | COMMON_STRIDE: 4 20 | TRANSFORMER_ENC_LAYERS: 6 21 | FEATURE_ORDER: "low2high" 22 | MaskDINO: 23 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 24 | DEEP_SUPERVISION: True 25 | NO_OBJECT_WEIGHT: 0.1 26 | CLASS_WEIGHT: 4.0 27 | MASK_WEIGHT: 5.0 28 | DICE_WEIGHT: 5.0 29 | BOX_WEIGHT: 5.0 30 | GIOU_WEIGHT: 2.0 31 | HIDDEN_DIM: 256 32 | NUM_OBJECT_QUERIES: 300 33 | 
NHEADS: 8 34 | DROPOUT: 0.0 35 | DIM_FEEDFORWARD: 2048 36 | ENC_LAYERS: 0 37 | PRE_NORM: False 38 | ENFORCE_INPUT_PROJ: False 39 | SIZE_DIVISIBILITY: 32 40 | DEC_LAYERS: 9 # 9+1, 9 decoder layers, add one for the loss on learnable query 41 | TRAIN_NUM_POINTS: 12544 42 | OVERSAMPLE_RATIO: 3.0 43 | IMPORTANCE_SAMPLE_RATIO: 0.75 44 | EVAL_FLAG: 1 45 | INITIAL_PRED: True 46 | TWO_STAGE: True 47 | DN: "seg" 48 | DN_NUM: 100 49 | INITIALIZE_BOX_TYPE: 'bitmask' 50 | TEST: 51 | SEMANTIC_ON: False 52 | INSTANCE_ON: True 53 | PANOPTIC_ON: False 54 | OVERLAP_THRESHOLD: 0.8 55 | OBJECT_MASK_THRESHOLD: 0.25 56 | 57 | SOLVER: 58 | AMP: 59 | ENABLED: True 60 | TEST: 61 | EVAL_PERIOD: 5000 62 | # EVAL_FLAG: 1 63 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/maskdino_R50_bs16_50ep_3s_dowsample2.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskDINOHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 80 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 14 | DIM_FEEDFORWARD: 1024 15 | NUM_FEATURE_LEVELS: 3 16 | TOTAL_NUM_FEATURE_LEVELS: 5 17 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 18 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 19 | COMMON_STRIDE: 4 20 | TRANSFORMER_ENC_LAYERS: 6 21 | MaskDINO: 22 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 23 | DEEP_SUPERVISION: True 24 | NO_OBJECT_WEIGHT: 0.1 25 | CLASS_WEIGHT: 4.0 26 | MASK_WEIGHT: 5.0 27 | DICE_WEIGHT: 5.0 28 | BOX_WEIGHT: 5.0 29 | GIOU_WEIGHT: 2.0 30 | HIDDEN_DIM: 256 31 | NUM_OBJECT_QUERIES: 300 32 | NHEADS: 8 33 | DROPOUT: 0.0 34 | DIM_FEEDFORWARD: 2048 35 | ENC_LAYERS: 0 36 | PRE_NORM: False 37 | ENFORCE_INPUT_PROJ: False 38 | SIZE_DIVISIBILITY: 32 39 | DEC_LAYERS: 9 # 9+1, 9 decoder layers, add one for the loss on learnable query 40 | TRAIN_NUM_POINTS: 12544 41 | OVERSAMPLE_RATIO: 3.0 42 | IMPORTANCE_SAMPLE_RATIO: 0.75 43 | EVAL_FLAG: 1 44 | INITIAL_PRED: True 45 | TWO_STAGE: True 46 | DN: "seg" 47 | DN_NUM: 100 48 | INITIALIZE_BOX_TYPE: 'no' 49 | TEST: 50 | SEMANTIC_ON: False 51 | INSTANCE_ON: True 52 | PANOPTIC_ON: False 53 | OVERLAP_THRESHOLD: 0.8 54 | OBJECT_MASK_THRESHOLD: 0.25 55 | 56 | SOLVER: 57 | AMP: 58 | ENABLED: True 59 | TEST: 60 | EVAL_PERIOD: 5000 61 | # EVAL_FLAG: 1 -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/maskdino_R50_bs16_50ep_4s_dowsample1_2048.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskDINOHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 80 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 14 | DIM_FEEDFORWARD: 2048 15 | NUM_FEATURE_LEVELS: 3 16 | TOTAL_NUM_FEATURE_LEVELS: 4 17 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 18 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res2","res3", "res4", "res5"] 19 | COMMON_STRIDE: 4 20 | TRANSFORMER_ENC_LAYERS: 6 21 | FEATURE_ORDER: "low2high" 22 | MaskDINO: 23 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 24 | DEEP_SUPERVISION: True 25 | NO_OBJECT_WEIGHT: 0.1 26 | CLASS_WEIGHT: 4.0 27 | MASK_WEIGHT: 5.0 28 | 
DICE_WEIGHT: 5.0 29 | BOX_WEIGHT: 5.0 30 | GIOU_WEIGHT: 2.0 31 | HIDDEN_DIM: 256 32 | NUM_OBJECT_QUERIES: 300 33 | NHEADS: 8 34 | DROPOUT: 0.0 35 | DIM_FEEDFORWARD: 2048 36 | ENC_LAYERS: 0 37 | PRE_NORM: False 38 | ENFORCE_INPUT_PROJ: False 39 | SIZE_DIVISIBILITY: 32 40 | DEC_LAYERS: 9 # 9+1, 9 decoder layers, add one for the loss on learnable query 41 | TRAIN_NUM_POINTS: 12544 42 | OVERSAMPLE_RATIO: 3.0 43 | IMPORTANCE_SAMPLE_RATIO: 0.75 44 | EVAL_FLAG: 1 45 | INITIAL_PRED: True 46 | TWO_STAGE: True 47 | DN: "seg" 48 | DN_NUM: 100 49 | INITIALIZE_BOX_TYPE: 'no' 50 | TEST: 51 | SEMANTIC_ON: False 52 | INSTANCE_ON: True 53 | PANOPTIC_ON: False 54 | OVERLAP_THRESHOLD: 0.8 55 | OBJECT_MASK_THRESHOLD: 0.25 56 | 57 | SOLVER: 58 | AMP: 59 | ENABLED: True 60 | TEST: 61 | EVAL_PERIOD: 5000 62 | # EVAL_FLAG: 1 -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/swin/maskdino_R50_bs16_50ep_4s_dowsample1_1024.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | BACKBONE: 5 | NAME: "D2SwinTransformer" 6 | SWIN: 7 | EMBED_DIM: 192 8 | DEPTHS: [ 2, 2, 18, 2 ] 9 | NUM_HEADS: [ 6, 12, 24, 48 ] 10 | WINDOW_SIZE: 12 11 | APE: False 12 | DROP_PATH_RATE: 0.3 13 | PATCH_NORM: True 14 | PRETRAIN_IMG_SIZE: 384 15 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 16 | PIXEL_MEAN: [ 123.675, 116.280, 103.530 ] 17 | PIXEL_STD: [ 58.395, 57.120, 57.375 ] 18 | # head 19 | SEM_SEG_HEAD: 20 | NAME: "MaskDINOHead" 21 | IGNORE_VALUE: 255 22 | NUM_CLASSES: 80 23 | LOSS_WEIGHT: 1.0 24 | CONVS_DIM: 256 25 | MASK_DIM: 256 26 | NORM: "GN" 27 | # pixel decoder 28 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 29 | DIM_FEEDFORWARD: 1024 30 | NUM_FEATURE_LEVELS: 4 31 | TOTAL_NUM_FEATURE_LEVELS: 5 32 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 33 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res2","res3", "res4", "res5"] 34 | COMMON_STRIDE: 4 35 | TRANSFORMER_ENC_LAYERS: 6 36 | FEATURE_ORDER: "low2high" 37 | MaskDINO: 38 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 39 | DEEP_SUPERVISION: True 40 | NO_OBJECT_WEIGHT: 0.1 41 | CLASS_WEIGHT: 4.0 42 | MASK_WEIGHT: 5.0 43 | DICE_WEIGHT: 5.0 44 | BOX_WEIGHT: 5.0 45 | GIOU_WEIGHT: 2.0 46 | HIDDEN_DIM: 256 47 | NUM_OBJECT_QUERIES: 300 48 | NHEADS: 8 49 | DROPOUT: 0.0 50 | DIM_FEEDFORWARD: 2048 51 | ENC_LAYERS: 0 52 | PRE_NORM: False 53 | ENFORCE_INPUT_PROJ: False 54 | SIZE_DIVISIBILITY: 32 55 | DEC_LAYERS: 9 # 9+1, 9 decoder layers, add one for the loss on learnable query 56 | TRAIN_NUM_POINTS: 12544 57 | OVERSAMPLE_RATIO: 3.0 58 | IMPORTANCE_SAMPLE_RATIO: 0.75 59 | EVAL_FLAG: 1 60 | INITIAL_PRED: True 61 | TWO_STAGE: True 62 | DN: "seg" 63 | DN_NUM: 100 64 | INITIALIZE_BOX_TYPE: 'bitmask' 65 | TEST: 66 | SEMANTIC_ON: False 67 | INSTANCE_ON: True 68 | PANOPTIC_ON: False 69 | OVERLAP_THRESHOLD: 0.8 70 | OBJECT_MASK_THRESHOLD: 0.25 71 | 72 | SOLVER: 73 | AMP: 74 | ENABLED: True 75 | TEST: 76 | EVAL_PERIOD: 5000 77 | # EVAL_FLAG: 1 -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/swin/maskdino_R50_bs16_50ep_4s_dowsample1_2048.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | BACKBONE: 5 | NAME: "D2SwinTransformer" 6 | SWIN: 7 | EMBED_DIM: 192 8 | DEPTHS: [ 2, 2, 18, 2 ] 9 | NUM_HEADS: [ 6, 12, 
24, 48 ] 10 | WINDOW_SIZE: 12 11 | APE: False 12 | DROP_PATH_RATE: 0.3 13 | PATCH_NORM: True 14 | PRETRAIN_IMG_SIZE: 384 15 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 16 | PIXEL_MEAN: [ 123.675, 116.280, 103.530 ] 17 | PIXEL_STD: [ 58.395, 57.120, 57.375 ] 18 | # head 19 | SEM_SEG_HEAD: 20 | NAME: "MaskDINOHead" 21 | IGNORE_VALUE: 255 22 | NUM_CLASSES: 80 23 | LOSS_WEIGHT: 1.0 24 | CONVS_DIM: 256 25 | MASK_DIM: 256 26 | NORM: "GN" 27 | # pixel decoder 28 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 29 | DIM_FEEDFORWARD: 2048 30 | NUM_FEATURE_LEVELS: 4 31 | TOTAL_NUM_FEATURE_LEVELS: 5 32 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 33 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res2","res3", "res4", "res5"] 34 | COMMON_STRIDE: 4 35 | TRANSFORMER_ENC_LAYERS: 6 36 | FEATURE_ORDER: "low2high" 37 | MaskDINO: 38 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 39 | DEEP_SUPERVISION: True 40 | NO_OBJECT_WEIGHT: 0.1 41 | CLASS_WEIGHT: 4.0 42 | MASK_WEIGHT: 5.0 43 | DICE_WEIGHT: 5.0 44 | BOX_WEIGHT: 5.0 45 | GIOU_WEIGHT: 2.0 46 | HIDDEN_DIM: 256 47 | NUM_OBJECT_QUERIES: 300 48 | NHEADS: 8 49 | DROPOUT: 0.0 50 | DIM_FEEDFORWARD: 2048 51 | ENC_LAYERS: 0 52 | PRE_NORM: False 53 | ENFORCE_INPUT_PROJ: False 54 | SIZE_DIVISIBILITY: 32 55 | DEC_LAYERS: 9 # 9+1, 9 decoder layers, add one for the loss on learnable query 56 | TRAIN_NUM_POINTS: 12544 57 | OVERSAMPLE_RATIO: 3.0 58 | IMPORTANCE_SAMPLE_RATIO: 0.75 59 | EVAL_FLAG: 1 60 | INITIAL_PRED: True 61 | TWO_STAGE: True 62 | DN: "seg" 63 | DN_NUM: 100 64 | INITIALIZE_BOX_TYPE: 'bitmask' 65 | TEST: 66 | SEMANTIC_ON: False 67 | INSTANCE_ON: True 68 | PANOPTIC_ON: False 69 | OVERLAP_THRESHOLD: 0.8 70 | OBJECT_MASK_THRESHOLD: 0.25 71 | 72 | SOLVER: 73 | AMP: 74 | ENABLED: True 75 | TEST: 76 | EVAL_PERIOD: 5000 77 | # EVAL_FLAG: 1 -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/swin/maskdino_R50_bs16_50ep_4s_dowsample1_2048_no_maskEnhance.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | BACKBONE: 5 | NAME: "D2SwinTransformer" 6 | SWIN: 7 | EMBED_DIM: 192 8 | DEPTHS: [ 2, 2, 18, 2 ] 9 | NUM_HEADS: [ 6, 12, 24, 48 ] 10 | WINDOW_SIZE: 12 11 | APE: False 12 | DROP_PATH_RATE: 0.3 13 | PATCH_NORM: True 14 | PRETRAIN_IMG_SIZE: 384 15 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 16 | PIXEL_MEAN: [ 123.675, 116.280, 103.530 ] 17 | PIXEL_STD: [ 58.395, 57.120, 57.375 ] 18 | # head 19 | SEM_SEG_HEAD: 20 | NAME: "MaskDINOHead" 21 | IGNORE_VALUE: 255 22 | NUM_CLASSES: 80 23 | LOSS_WEIGHT: 1.0 24 | CONVS_DIM: 256 25 | MASK_DIM: 256 26 | NORM: "GN" 27 | # pixel decoder 28 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 29 | DIM_FEEDFORWARD: 2048 30 | NUM_FEATURE_LEVELS: 4 31 | TOTAL_NUM_FEATURE_LEVELS: 5 32 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 33 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res2","res3", "res4", "res5"] 34 | COMMON_STRIDE: 4 35 | TRANSFORMER_ENC_LAYERS: 6 36 | FEATURE_ORDER: "low2high" 37 | MaskDINO: 38 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 39 | DEEP_SUPERVISION: True 40 | NO_OBJECT_WEIGHT: 0.1 41 | CLASS_WEIGHT: 4.0 42 | MASK_WEIGHT: 5.0 43 | DICE_WEIGHT: 5.0 44 | BOX_WEIGHT: 5.0 45 | GIOU_WEIGHT: 2.0 46 | HIDDEN_DIM: 256 47 | NUM_OBJECT_QUERIES: 300 48 | NHEADS: 8 49 | DROPOUT: 0.0 50 | DIM_FEEDFORWARD: 2048 51 | ENC_LAYERS: 0 52 | PRE_NORM: False 53 | ENFORCE_INPUT_PROJ: False 54 | SIZE_DIVISIBILITY: 32 55 | DEC_LAYERS: 9 # 
9+1, 9 decoder layers, add one for the loss on learnable query 56 | TRAIN_NUM_POINTS: 12544 57 | OVERSAMPLE_RATIO: 3.0 58 | IMPORTANCE_SAMPLE_RATIO: 0.75 59 | EVAL_FLAG: 1 60 | INITIAL_PRED: True 61 | TWO_STAGE: True 62 | DN: "seg" 63 | DN_NUM: 100 64 | INITIALIZE_BOX_TYPE: 'no' 65 | TEST: 66 | SEMANTIC_ON: False 67 | INSTANCE_ON: True 68 | PANOPTIC_ON: False 69 | OVERLAP_THRESHOLD: 0.8 70 | OBJECT_MASK_THRESHOLD: 0.25 71 | 72 | SOLVER: 73 | AMP: 74 | ENABLED: True 75 | TEST: 76 | EVAL_PERIOD: 5000 77 | # EVAL_FLAG: 1 -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/Base-COCO-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train_panoptic",) 18 | TEST: ("coco_2017_val_panoptic_with_sem_seg",) # to evaluate instance and semantic performance as well 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_panoptic_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/maskdino_R50_bs16_50ep_3s_dowsample1_2048.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskDINOHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 133 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 14 | DIM_FEEDFORWARD: 2048 15 | NUM_FEATURE_LEVELS: 3 16 | TOTAL_NUM_FEATURE_LEVELS: 4 17 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 18 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 19 | COMMON_STRIDE: 4 20 | TRANSFORMER_ENC_LAYERS: 6 21 | FEATURE_ORDER: "low2high" 22 | MaskDINO: 23 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 24 | DEEP_SUPERVISION: True 25 | NO_OBJECT_WEIGHT: 0.1 26 | CLASS_WEIGHT: 4.0 27 | MASK_WEIGHT: 5.0 28 | DICE_WEIGHT: 5.0 29 | BOX_WEIGHT: 5.0 30 | GIOU_WEIGHT: 2.0 31 | HIDDEN_DIM: 256 32 | NUM_OBJECT_QUERIES: 300 33 | NHEADS: 8 34 | DROPOUT: 0.0 35 | DIM_FEEDFORWARD: 2048 36 | ENC_LAYERS: 0 37 | PRE_NORM: False 38 | ENFORCE_INPUT_PROJ: False 39 | SIZE_DIVISIBILITY: 32 40 | DEC_LAYERS: 9 # 9+1, 9 decoder layers, add one for the loss on learnable query 41 | TRAIN_NUM_POINTS: 12544 42 | OVERSAMPLE_RATIO: 3.0 43 | IMPORTANCE_SAMPLE_RATIO: 0.75 44 | EVAL_FLAG: 1 45 | INITIAL_PRED: True 46 | TWO_STAGE: 
True 47 | DN: "seg" 48 | DN_NUM: 100 49 | INITIALIZE_BOX_TYPE: 'no' 50 | PANO_BOX_LOSS: False 51 | TEST: 52 | SEMANTIC_ON: True 53 | INSTANCE_ON: True 54 | PANOPTIC_ON: True 55 | OVERLAP_THRESHOLD: 0.8 56 | OBJECT_MASK_THRESHOLD: 0.25 57 | 58 | SOLVER: 59 | AMP: 60 | ENABLED: True 61 | TEST: 62 | EVAL_PERIOD: 5000 63 | # EVAL_FLAG: 1 -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/swin/maskdino_R50_bs16_50ep_4s_dowsample1_2048.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../Base-COCO-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | BACKBONE: 5 | NAME: "D2SwinTransformer" 6 | SWIN: 7 | EMBED_DIM: 192 8 | DEPTHS: [ 2, 2, 18, 2 ] 9 | NUM_HEADS: [ 6, 12, 24, 48 ] 10 | WINDOW_SIZE: 12 11 | APE: False 12 | DROP_PATH_RATE: 0.3 13 | PATCH_NORM: True 14 | PRETRAIN_IMG_SIZE: 384 15 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 16 | PIXEL_MEAN: [ 123.675, 116.280, 103.530 ] 17 | PIXEL_STD: [ 58.395, 57.120, 57.375 ] 18 | # head 19 | SEM_SEG_HEAD: 20 | NAME: "MaskDINOHead" 21 | IGNORE_VALUE: 255 22 | NUM_CLASSES: 133 23 | LOSS_WEIGHT: 1.0 24 | CONVS_DIM: 256 25 | MASK_DIM: 256 26 | NORM: "GN" 27 | # pixel decoder 28 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 29 | DIM_FEEDFORWARD: 2048 30 | NUM_FEATURE_LEVELS: 4 31 | TOTAL_NUM_FEATURE_LEVELS: 5 32 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 33 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res2","res3", "res4", "res5"] 34 | COMMON_STRIDE: 4 35 | TRANSFORMER_ENC_LAYERS: 6 36 | FEATURE_ORDER: "low2high" 37 | MaskDINO: 38 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 39 | DEEP_SUPERVISION: True 40 | NO_OBJECT_WEIGHT: 0.1 41 | CLASS_WEIGHT: 4.0 42 | MASK_WEIGHT: 5.0 43 | DICE_WEIGHT: 5.0 44 | BOX_WEIGHT: 5.0 45 | GIOU_WEIGHT: 2.0 46 | HIDDEN_DIM: 256 47 | NUM_OBJECT_QUERIES: 300 48 | NHEADS: 8 49 | DROPOUT: 0.0 50 | DIM_FEEDFORWARD: 2048 51 | ENC_LAYERS: 0 52 | PRE_NORM: False 53 | ENFORCE_INPUT_PROJ: False 54 | SIZE_DIVISIBILITY: 32 55 | DEC_LAYERS: 9 # 9+1, 9 decoder layers, add one for the loss on learnable query 56 | TRAIN_NUM_POINTS: 12544 57 | OVERSAMPLE_RATIO: 3.0 58 | IMPORTANCE_SAMPLE_RATIO: 0.75 59 | EVAL_FLAG: 1 60 | INITIAL_PRED: True 61 | TWO_STAGE: True 62 | DN: "seg" 63 | DN_NUM: 100 64 | INITIALIZE_BOX_TYPE: 'no' 65 | PANO_BOX_LOSS: False 66 | TEST: 67 | SEMANTIC_ON: True 68 | INSTANCE_ON: True 69 | PANOPTIC_ON: True 70 | OVERLAP_THRESHOLD: 0.8 71 | OBJECT_MASK_THRESHOLD: 0.25 72 | 73 | SOLVER: 74 | AMP: 75 | ENABLED: True 76 | TEST: 77 | EVAL_PERIOD: 5000 78 | # EVAL_FLAG: 1 -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/swin/maskdino_R50_bs16_50ep_4s_dowsample1_large_eval.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../Base-COCO-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | BACKBONE: 5 | NAME: "D2SwinTransformer" 6 | SWIN: 7 | EMBED_DIM: 192 8 | DEPTHS: [ 2, 2, 18, 2 ] 9 | NUM_HEADS: [ 6, 12, 24, 48 ] 10 | WINDOW_SIZE: 12 11 | APE: False 12 | DROP_PATH_RATE: 0.3 13 | PATCH_NORM: True 14 | PRETRAIN_IMG_SIZE: 384 15 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 16 | PIXEL_MEAN: [ 123.675, 116.280, 103.530 ] 17 | PIXEL_STD: [ 58.395, 57.120, 57.375 ] 18 | # head 19 | SEM_SEG_HEAD: 20 | NAME: "MaskDINOHead" 21 | IGNORE_VALUE: 255 22 | NUM_CLASSES: 133 23 | LOSS_WEIGHT: 1.0 24 | CONVS_DIM: 256 25 | MASK_DIM: 256 26 | NORM: "GN" 27 | 
# pixel decoder 28 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 29 | DIM_FEEDFORWARD: 2048 30 | NUM_FEATURE_LEVELS: 4 31 | TOTAL_NUM_FEATURE_LEVELS: 5 32 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 33 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res2","res3", "res4", "res5"] 34 | COMMON_STRIDE: 4 35 | TRANSFORMER_ENC_LAYERS: 6 36 | FEATURE_ORDER: "low2high" 37 | MaskDINO: 38 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 39 | DEEP_SUPERVISION: True 40 | NO_OBJECT_WEIGHT: 0.1 41 | CLASS_WEIGHT: 4.0 42 | MASK_WEIGHT: 5.0 43 | DICE_WEIGHT: 5.0 44 | BOX_WEIGHT: 5.0 45 | GIOU_WEIGHT: 2.0 46 | HIDDEN_DIM: 256 47 | NUM_OBJECT_QUERIES: 300 48 | NHEADS: 8 49 | DROPOUT: 0.0 50 | DIM_FEEDFORWARD: 2048 51 | ENC_LAYERS: 0 52 | PRE_NORM: False 53 | ENFORCE_INPUT_PROJ: False 54 | SIZE_DIVISIBILITY: 32 55 | DEC_LAYERS: 9 # 9+1, 9 decoder layers, add one for the loss on learnable query 56 | TRAIN_NUM_POINTS: 12544 57 | OVERSAMPLE_RATIO: 3.0 58 | IMPORTANCE_SAMPLE_RATIO: 0.75 59 | EVAL_FLAG: 1 60 | INITIAL_PRED: True 61 | TWO_STAGE: True 62 | DN: "seg" 63 | DN_NUM: 100 64 | INITIALIZE_BOX_TYPE: 'no' 65 | PANO_BOX_LOSS: False 66 | TEST: 67 | SEMANTIC_ON: True 68 | INSTANCE_ON: True 69 | PANOPTIC_ON: True 70 | OVERLAP_THRESHOLD: 0.8 71 | OBJECT_MASK_THRESHOLD: 0.25 72 | 73 | SOLVER: 74 | AMP: 75 | ENABLED: True 76 | TEST: 77 | EVAL_PERIOD: 5000 78 | # EVAL_FLAG: 1 -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/swin/maskdino_R50_bs16_50ep_4s_dowsample1_maskEnhance_2048.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../Base-COCO-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | BACKBONE: 5 | NAME: "D2SwinTransformer" 6 | SWIN: 7 | EMBED_DIM: 192 8 | DEPTHS: [ 2, 2, 18, 2 ] 9 | NUM_HEADS: [ 6, 12, 24, 48 ] 10 | WINDOW_SIZE: 12 11 | APE: False 12 | DROP_PATH_RATE: 0.3 13 | PATCH_NORM: True 14 | PRETRAIN_IMG_SIZE: 384 15 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 16 | PIXEL_MEAN: [ 123.675, 116.280, 103.530 ] 17 | PIXEL_STD: [ 58.395, 57.120, 57.375 ] 18 | # head 19 | SEM_SEG_HEAD: 20 | NAME: "MaskDINOHead" 21 | IGNORE_VALUE: 255 22 | NUM_CLASSES: 133 23 | LOSS_WEIGHT: 1.0 24 | CONVS_DIM: 256 25 | MASK_DIM: 256 26 | NORM: "GN" 27 | # pixel decoder 28 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 29 | DIM_FEEDFORWARD: 2048 30 | NUM_FEATURE_LEVELS: 4 31 | TOTAL_NUM_FEATURE_LEVELS: 5 32 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 33 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res2","res3", "res4", "res5"] 34 | COMMON_STRIDE: 4 35 | TRANSFORMER_ENC_LAYERS: 6 36 | FEATURE_ORDER: "low2high" 37 | MaskDINO: 38 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 39 | DEEP_SUPERVISION: True 40 | NO_OBJECT_WEIGHT: 0.1 41 | CLASS_WEIGHT: 4.0 42 | MASK_WEIGHT: 5.0 43 | DICE_WEIGHT: 5.0 44 | BOX_WEIGHT: 5.0 45 | GIOU_WEIGHT: 2.0 46 | HIDDEN_DIM: 256 47 | NUM_OBJECT_QUERIES: 300 48 | NHEADS: 8 49 | DROPOUT: 0.0 50 | DIM_FEEDFORWARD: 2048 51 | ENC_LAYERS: 0 52 | PRE_NORM: False 53 | ENFORCE_INPUT_PROJ: False 54 | SIZE_DIVISIBILITY: 32 55 | DEC_LAYERS: 9 # 9+1, 9 decoder layers, add one for the loss on learnable query 56 | TRAIN_NUM_POINTS: 12544 57 | OVERSAMPLE_RATIO: 3.0 58 | IMPORTANCE_SAMPLE_RATIO: 0.75 59 | EVAL_FLAG: 1 60 | INITIAL_PRED: True 61 | TWO_STAGE: True 62 | DN: "seg" 63 | DN_NUM: 100 64 | INITIALIZE_BOX_TYPE: 'bitmask' 65 | PANO_BOX_LOSS: False 66 | TEST: 67 | SEMANTIC_ON: True 68 | INSTANCE_ON: True 69 | PANOPTIC_ON: True 70 | OVERLAP_THRESHOLD: 0.8 71 | 
OBJECT_MASK_THRESHOLD: 0.25 72 | 73 | SOLVER: 74 | AMP: 75 | ENABLED: True 76 | TEST: 77 | EVAL_PERIOD: 5000 78 | # EVAL_FLAG: 1 -------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | # Prepare Datasets for MaskDINO 2 | 3 | A dataset can be used by accessing [DatasetCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.DatasetCatalog) 4 | for its data, or [MetadataCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.MetadataCatalog) for its metadata (class names, etc.). 5 | This document explains how to set up the builtin datasets so they can be used by the above APIs. 6 | [Use Custom Datasets](https://detectron2.readthedocs.io/tutorials/datasets.html) gives a deeper dive on how to use `DatasetCatalog` and `MetadataCatalog`, 7 | and how to add new datasets to them. 8 | 9 | MaskDINO has builtin support for a few datasets. 10 | The datasets are assumed to exist in a directory specified by the environment variable 11 | `DETECTRON2_DATASETS`. 12 | Under this directory, detectron2 will look for datasets in the structure described below, if needed. 13 | ``` 14 | $DETECTRON2_DATASETS/ 15 | ADEChallengeData2016/ 16 | coco/ 17 | cityscapes/ 18 | ``` 19 | 20 | You can set the location for the builtin datasets with `export DETECTRON2_DATASETS=/path/to/datasets`. 21 | If left unset, the default is `./datasets` relative to your current working directory. 22 | 23 | [comment]: <> (The [model zoo](https://github.com/facebookresearch/MaskFormer/blob/master/MODEL_ZOO.md)) 24 | 25 | [comment]: <> (contains configs and models that use these builtin datasets.) 26 | 27 | 28 | ## Expected dataset structure for [COCO](https://cocodataset.org/#download): 29 | 30 | ``` 31 | coco/ 32 | annotations/ 33 | instances_{train,val}2017.json 34 | panoptic_{train,val}2017.json 35 | {train,val}2017/ 36 | # image files that are mentioned in the corresponding json 37 | panoptic_{train,val}2017/ # png annotations 38 | panoptic_semseg_{train,val}2017/ # generated by the script mentioned below 39 | ``` 40 | 41 | Install panopticapi by: 42 | ``` 43 | pip install git+https://github.com/cocodataset/panopticapi.git 44 | ``` 45 | Then run `python datasets/prepare_coco_semantic_annos_from_panoptic_annos.py` to extract semantic annotations from panoptic annotations (only used for evaluation). 46 | 47 | 48 | ## Expected dataset structure for [cityscapes](https://www.cityscapes-dataset.com/downloads/): 49 | ``` 50 | cityscapes/ 51 | gtFine/ 52 | train/ 53 | aachen/ 54 | color.png, instanceIds.png, labelIds.png, polygons.json, 55 | labelTrainIds.png 56 | ... 57 | val/ 58 | test/ 59 | # below are generated Cityscapes panoptic annotations 60 | cityscapes_panoptic_train.json 61 | cityscapes_panoptic_train/ 62 | cityscapes_panoptic_val.json 63 | cityscapes_panoptic_val/ 64 | cityscapes_panoptic_test.json 65 | cityscapes_panoptic_test/ 66 | leftImg8bit/ 67 | train/ 68 | val/ 69 | test/ 70 | ``` 71 | Install cityscapes scripts by: 72 | ``` 73 | pip install git+https://github.com/mcordts/cityscapesScripts.git 74 | ``` 75 | 76 | Note: to create labelTrainIds.png, first prepare the above structure, then run the cityscapesScripts tool with: 77 | ``` 78 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesscripts/preparation/createTrainIdLabelImgs.py 79 | ``` 80 | These files are not needed for instance segmentation.
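The same labelTrainIds step can also be driven from Python rather than the shell. The snippet below is a minimal sketch and not part of this repository; it assumes the pip-installed cityscapesScripts package exposes a `main()` entry point in `createTrainIdLabelImgs` that reads `CITYSCAPES_DATASET` from the environment, as the shell command above suggests.

```python
# Illustrative only: drive the cityscapesScripts preparation step from Python.
# Assumes cityscapesScripts is installed (see above) and that createTrainIdLabelImgs.main()
# picks up the CITYSCAPES_DATASET environment variable.
import os

os.environ["CITYSCAPES_DATASET"] = "/path/to/abovementioned/cityscapes"

from cityscapesscripts.preparation.createTrainIdLabelImgs import main as create_train_id_labels

create_train_id_labels()  # writes the *_labelTrainIds.png files next to the gtFine annotations
```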
81 | 82 | Note: to generate Cityscapes panoptic dataset, run cityscapesescript with: 83 | ``` 84 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesscripts/preparation/createPanopticImgs.py 85 | ``` 86 | These files are not needed for semantic and instance segmentation. 87 | 88 | 89 | ## Expected dataset structure for [ADE20k](http://sceneparsing.csail.mit.edu/): 90 | ``` 91 | ADEChallengeData2016/ 92 | images/ 93 | annotations/ 94 | objectInfo150.txt 95 | # download instance annotation 96 | annotations_instance/ 97 | # generated by prepare_ade20k_sem_seg.py 98 | annotations_detectron2/ 99 | # below are generated by prepare_ade20k_pan_seg.py 100 | ade20k_panoptic_{train,val}.json 101 | ade20k_panoptic_{train,val}/ 102 | # below are generated by prepare_ade20k_ins_seg.py 103 | ade20k_instance_{train,val}.json 104 | ``` 105 | 106 | The directory `annotations_detectron2` is generated by running `python datasets/prepare_ade20k_sem_seg.py`. 107 | 108 | Install panopticapi by: 109 | ```bash 110 | pip install git+https://github.com/cocodataset/panopticapi.git 111 | ``` 112 | 113 | Download the instance annotation from http://sceneparsing.csail.mit.edu/: 114 | ```bash 115 | wget http://sceneparsing.csail.mit.edu/data/ChallengeData2017/annotations_instance.tar 116 | ``` 117 | 118 | Then, run `python datasets/prepare_ade20k_pan_seg.py`, to combine semantic and instance annotations for panoptic annotations. 119 | 120 | And run `python datasets/prepare_ade20k_ins_seg.py`, to extract instance annotations in COCO format. 121 | 122 | 123 | -------------------------------------------------------------------------------- /datasets/ade20k_instance_catid_mapping.txt: -------------------------------------------------------------------------------- 1 | Instacne100 SceneParse150 FullADE20K 2 | 1 8 165 3 | 2 9 3055 4 | 3 11 350 5 | 4 13 1831 6 | 5 15 774 7 | 5 15 783 8 | 6 16 2684 9 | 7 19 687 10 | 8 20 471 11 | 9 21 401 12 | 10 23 1735 13 | 11 24 2473 14 | 12 25 2329 15 | 13 28 1564 16 | 14 31 57 17 | 15 32 2272 18 | 16 33 907 19 | 17 34 724 20 | 18 36 2985 21 | 18 36 533 22 | 19 37 1395 23 | 20 38 155 24 | 21 39 2053 25 | 22 40 689 26 | 23 42 266 27 | 24 43 581 28 | 25 44 2380 29 | 26 45 491 30 | 27 46 627 31 | 28 48 2388 32 | 29 50 943 33 | 30 51 2096 34 | 31 54 2530 35 | 32 56 420 36 | 33 57 1948 37 | 34 58 1869 38 | 35 59 2251 39 | 36 63 239 40 | 37 65 571 41 | 38 66 2793 42 | 39 67 978 43 | 40 68 236 44 | 41 70 181 45 | 42 71 629 46 | 43 72 2598 47 | 44 73 1744 48 | 45 74 1374 49 | 46 75 591 50 | 47 76 2679 51 | 48 77 223 52 | 49 79 47 53 | 50 81 327 54 | 51 82 2821 55 | 52 83 1451 56 | 53 84 2880 57 | 54 86 480 58 | 55 87 77 59 | 56 88 2616 60 | 57 89 246 61 | 57 89 247 62 | 58 90 2733 63 | 59 91 14 64 | 60 93 38 65 | 61 94 1936 66 | 62 96 120 67 | 63 98 1702 68 | 64 99 249 69 | 65 103 2928 70 | 66 104 2337 71 | 67 105 1023 72 | 68 108 2989 73 | 69 109 1930 74 | 70 111 2586 75 | 71 112 131 76 | 72 113 146 77 | 73 116 95 78 | 74 117 1563 79 | 75 119 1708 80 | 76 120 103 81 | 77 121 1002 82 | 78 122 2569 83 | 79 124 2833 84 | 80 125 1551 85 | 81 126 1981 86 | 82 127 29 87 | 83 128 187 88 | 84 130 747 89 | 85 131 2254 90 | 86 133 2262 91 | 87 134 1260 92 | 88 135 2243 93 | 89 136 2932 94 | 90 137 2836 95 | 91 138 2850 96 | 92 139 64 97 | 93 140 894 98 | 94 143 1919 99 | 95 144 1583 100 | 96 145 318 101 | 97 147 2046 102 | 98 148 1098 103 | 99 149 530 104 | 100 150 954 105 | -------------------------------------------------------------------------------- 
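The next file, `datasets/prepare_ade20k_ins_seg.py`, implements the instance-annotation extraction step described in the README above and writes `ade20k_instance_{train,val}.json`. A quick way to sanity-check its output is to load the generated JSON with pycocotools; the snippet below is a minimal sketch, not part of this repository, and assumes the default `./datasets` location used when `DETECTRON2_DATASETS` is unset.

```python
# Illustrative sanity check for the generated ADE20K instance annotations.
import os

from pycocotools.coco import COCO

dataset_dir = os.getenv("DETECTRON2_DATASETS", "datasets")
ann_file = os.path.join(dataset_dir, "ADEChallengeData2016", "ade20k_instance_val.json")

coco = COCO(ann_file)  # parses the COCO-format JSON written by prepare_ade20k_ins_seg.py
print("images:", len(coco.getImgIds()))
print("annotations:", len(coco.getAnnIds()))
print("categories:", len(coco.getCatIds()))  # 100 instance categories are expected for ADE20K
```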
/datasets/prepare_ade20k_ins_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | import glob 5 | import json 6 | import os 7 | from collections import Counter 8 | 9 | import numpy as np 10 | import tqdm 11 | from panopticapi.utils import IdGenerator, save_json 12 | from PIL import Image 13 | import pycocotools.mask as mask_util 14 | 15 | 16 | if __name__ == "__main__": 17 | dataset_dir = os.getenv("DETECTRON2_DATASETS", "datasets") 18 | 19 | for name, dirname in [("train", "training"), ("val", "validation")]: 20 | image_dir = os.path.join(dataset_dir, f"ADEChallengeData2016/images/{dirname}/") 21 | instance_dir = os.path.join( 22 | dataset_dir, f"ADEChallengeData2016/annotations_instance/{dirname}/" 23 | ) 24 | 25 | # img_id = 0 26 | ann_id = 1 27 | 28 | # json 29 | out_file = os.path.join(dataset_dir, f"ADEChallengeData2016/ade20k_instance_{name}.json") 30 | 31 | # json config 32 | instance_config_file = "datasets/ade20k_instance_imgCatIds.json" 33 | with open(instance_config_file) as f: 34 | category_dict = json.load(f)["categories"] 35 | 36 | # load catid mapping 37 | # it is important to share category id for both instance and panoptic annotations 38 | mapping_file = "datasets/ade20k_instance_catid_mapping.txt" 39 | with open(mapping_file) as f: 40 | map_id = {} 41 | for i, line in enumerate(f.readlines()): 42 | if i == 0: 43 | continue 44 | ins_id, sem_id, _ = line.strip().split() 45 | # shift id by 1 because we want it to start from 0! 46 | # ignore_label becomes 255 47 | map_id[int(ins_id)] = int(sem_id) - 1 48 | 49 | for cat in category_dict: 50 | cat["id"] = map_id[cat["id"]] 51 | 52 | filenames = sorted(glob.glob(os.path.join(image_dir, "*.jpg"))) 53 | 54 | ann_dict = {} 55 | images = [] 56 | annotations = [] 57 | 58 | for idx, filename in enumerate(tqdm.tqdm(filenames)): 59 | image = {} 60 | image_id = os.path.basename(filename).split(".")[0] 61 | 62 | image["id"] = image_id 63 | image["file_name"] = os.path.basename(filename) 64 | 65 | original_format = np.array(Image.open(filename)) 66 | image["width"] = original_format.shape[1] 67 | image["height"] = original_format.shape[0] 68 | 69 | images.append(image) 70 | 71 | filename_instance = os.path.join(instance_dir, image_id + ".png") 72 | ins_seg = np.asarray(Image.open(filename_instance)) 73 | assert ins_seg.dtype == np.uint8 74 | 75 | instance_cat_ids = ins_seg[..., 0] 76 | # instance id starts from 1! 
77 | # because 0 is reserved as VOID label 78 | instance_ins_ids = ins_seg[..., 1] 79 | 80 | # process things 81 | for thing_id in np.unique(instance_ins_ids): 82 | if thing_id == 0: 83 | continue 84 | mask = instance_ins_ids == thing_id 85 | instance_cat_id = np.unique(instance_cat_ids[mask]) 86 | assert len(instance_cat_id) == 1 87 | 88 | anno = {} 89 | anno['id'] = ann_id 90 | ann_id += 1 91 | anno['image_id'] = image['id'] 92 | anno["iscrowd"] = int(0) 93 | anno["category_id"] = int(map_id[instance_cat_id[0]]) 94 | 95 | inds = np.nonzero(mask) 96 | ymin, ymax = inds[0].min(), inds[0].max() 97 | xmin, xmax = inds[1].min(), inds[1].max() 98 | anno["bbox"] = [int(xmin), int(ymin), int(xmax - xmin + 1), int(ymax - ymin + 1)] 99 | # if xmax <= xmin or ymax <= ymin: 100 | # continue 101 | rle = mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0] 102 | rle["counts"] = rle["counts"].decode("utf-8") 103 | anno["segmentation"] = rle 104 | anno["area"] = int(mask_util.area(rle)) 105 | annotations.append(anno) 106 | 107 | # save this 108 | ann_dict['images'] = images 109 | ann_dict['categories'] = category_dict 110 | ann_dict['annotations'] = annotations 111 | 112 | save_json(ann_dict, out_file) 113 | -------------------------------------------------------------------------------- /datasets/prepare_ade20k_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | import os 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import tqdm 9 | from PIL import Image 10 | 11 | 12 | def convert(input, output): 13 | img = np.asarray(Image.open(input)) 14 | assert img.dtype == np.uint8 15 | img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1 16 | Image.fromarray(img).save(output) 17 | 18 | 19 | if __name__ == "__main__": 20 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016" 21 | for name in ["training", "validation"]: 22 | annotation_dir = dataset_dir / "annotations" / name 23 | output_dir = dataset_dir / "annotations_detectron2" / name 24 | output_dir.mkdir(parents=True, exist_ok=True) 25 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 26 | output_file = output_dir / file.name 27 | convert(file, output_file) 28 | -------------------------------------------------------------------------------- /datasets/prepare_coco_semantic_annos_from_panoptic_annos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 
4 | 5 | import functools 6 | import json 7 | import multiprocessing as mp 8 | import numpy as np 9 | import os 10 | import time 11 | from fvcore.common.download import download 12 | from panopticapi.utils import rgb2id 13 | from PIL import Image 14 | 15 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES 16 | 17 | 18 | def _process_panoptic_to_semantic(input_panoptic, output_semantic, segments, id_map): 19 | panoptic = np.asarray(Image.open(input_panoptic), dtype=np.uint32) 20 | panoptic = rgb2id(panoptic) 21 | output = np.zeros_like(panoptic, dtype=np.uint8) + 255 22 | for seg in segments: 23 | cat_id = seg["category_id"] 24 | new_cat_id = id_map[cat_id] 25 | output[panoptic == seg["id"]] = new_cat_id 26 | Image.fromarray(output).save(output_semantic) 27 | 28 | 29 | def separate_coco_semantic_from_panoptic(panoptic_json, panoptic_root, sem_seg_root, categories): 30 | """ 31 | Create semantic segmentation annotations from panoptic segmentation 32 | annotations, to be used by PanopticFPN. 33 | It maps all thing categories to class 0, and maps all unlabeled pixels to class 255. 34 | It maps all stuff categories to contiguous ids starting from 1. 35 | Args: 36 | panoptic_json (str): path to the panoptic json file, in COCO's format. 37 | panoptic_root (str): a directory with panoptic annotation files, in COCO's format. 38 | sem_seg_root (str): a directory to output semantic annotation files 39 | categories (list[dict]): category metadata. Each dict needs to have: 40 | "id": corresponds to the "category_id" in the json annotations 41 | "isthing": 0 or 1 42 | """ 43 | os.makedirs(sem_seg_root, exist_ok=True) 44 | 45 | id_map = {} # map from category id to id in the output semantic annotation 46 | assert len(categories) <= 254 47 | for i, k in enumerate(categories): 48 | id_map[k["id"]] = i 49 | # what is id = 0? 50 | # id_map[0] = 255 51 | print(id_map) 52 | 53 | with open(panoptic_json) as f: 54 | obj = json.load(f) 55 | 56 | pool = mp.Pool(processes=max(mp.cpu_count() // 2, 4)) 57 | 58 | def iter_annotations(): 59 | for anno in obj["annotations"]: 60 | file_name = anno["file_name"] 61 | segments = anno["segments_info"] 62 | input = os.path.join(panoptic_root, file_name) 63 | output = os.path.join(sem_seg_root, file_name) 64 | yield input, output, segments 65 | 66 | print("Start writing to {} ...".format(sem_seg_root)) 67 | start = time.time() 68 | pool.starmap( 69 | functools.partial(_process_panoptic_to_semantic, id_map=id_map), 70 | iter_annotations(), 71 | chunksize=100, 72 | ) 73 | print("Finished. time: {:.2f}s".format(time.time() - start)) 74 | 75 | 76 | if __name__ == "__main__": 77 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco") 78 | for s in ["val2017", "train2017"]: 79 | separate_coco_semantic_from_panoptic( 80 | os.path.join(dataset_dir, "annotations/panoptic_{}.json".format(s)), 81 | os.path.join(dataset_dir, "panoptic_{}".format(s)), 82 | os.path.join(dataset_dir, "panoptic_semseg_{}".format(s)), 83 | COCO_CATEGORIES, 84 | ) 85 | -------------------------------------------------------------------------------- /demo/README.md: -------------------------------------------------------------------------------- 1 | ## Getting Started with MaskDINO 2 | 3 | This document provides a brief intro of the usage of **MaskDINO**. 4 | 5 | Please see [Getting Started with Detectron2](https://github.com/facebookresearch/detectron2/blob/master/GETTING_STARTED.md) for full usage. 6 | 7 | 8 | ### Inference Demo with Pre-trained Models 9 | 10 | 1. 
Pick a model and its config file 11 | - for example 12 | - config file at `/configs/coco/instance-segmentation/maskdino_R50_bs16_50ep_3s.yaml`. 13 | - Model file [MaskDINO (hid 1024) ](https://github.com/IDEA-Research/detrex-storage/releases/download/maskdino-v0.1.0/maskdino_r50_50ep_300q_hid1024_3sd1_instance_maskenhanced_mask46.1ap_box51.5ap.pth) 14 | 2. We provide `demo.py` that is able to demo builtin configs. 15 | 3. Run it with: 16 | ``` 17 | cd demo/ 18 | python demo.py --config-file /configs/coco/instance-segmentation/maskdino_R50_bs16_50ep_3s.yaml \ 19 | --input input1.jpg input2.jpg \ 20 | [--other-options] 21 | --opts MODEL.WEIGHTS /path/to/model_file 22 | ``` 23 | The configs are made for training, therefore we need to specify `MODEL.WEIGHTS` to a model from model zoo for evaluation. 24 | This command will run the inference and show visualizations in an OpenCV window. 25 | 26 | For details of the command line arguments, see `demo.py -h` or look at its source code 27 | to understand its behavior. Some common arguments are: 28 | * To run __on your webcam__, replace `--input files` with `--webcam`. 29 | * To run __on a video__, replace `--input files` with `--video-input video.mp4`. 30 | * To run __on cpu__, add `MODEL.DEVICE cpu` after `--opts`. 31 | * To save outputs to a directory (for images) or a file (for webcam or video), use `--output`. 32 | 33 | 34 | -------------------------------------------------------------------------------- /demo/demo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py 3 | import argparse 4 | import glob 5 | import multiprocessing as mp 6 | import os 7 | 8 | # fmt: off 9 | import sys 10 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 11 | # fmt: on 12 | 13 | import tempfile 14 | import time 15 | import warnings 16 | 17 | import cv2 18 | import numpy as np 19 | import tqdm 20 | 21 | from detectron2.config import get_cfg 22 | from detectron2.data.detection_utils import read_image 23 | from detectron2.projects.deeplab import add_deeplab_config 24 | from detectron2.utils.logger import setup_logger 25 | 26 | from maskdino import add_maskdino_config 27 | from predictor import VisualizationDemo 28 | 29 | 30 | # constants 31 | WINDOW_NAME = "mask2former demo" 32 | 33 | 34 | def setup_cfg(args): 35 | # load config from file and command-line arguments 36 | cfg = get_cfg() 37 | add_deeplab_config(cfg) 38 | add_maskdino_config(cfg) 39 | cfg.merge_from_file(args.config_file) 40 | cfg.merge_from_list(args.opts) 41 | cfg.freeze() 42 | return cfg 43 | 44 | 45 | def get_parser(): 46 | parser = argparse.ArgumentParser(description="maskdino demo for builtin configs") 47 | parser.add_argument( 48 | "--config-file", 49 | default="configs/coco/instance-segmentation/maskdino_R50_bs16_50ep_3s.yaml", 50 | metavar="FILE", 51 | help="path to config file", 52 | ) 53 | parser.add_argument("--webcam", action="store_true", help="Take inputs from webcam.") 54 | parser.add_argument("--video-input", help="Path to video file.") 55 | parser.add_argument( 56 | "--input", 57 | nargs="+", 58 | help="A list of space separated input images; " 59 | "or a single glob pattern such as 'directory/*.jpg'", 60 | ) 61 | parser.add_argument( 62 | "--output", 63 | help="A file or directory to save output visualizations. 
" 64 | "If not given, will show output in an OpenCV window.", 65 | ) 66 | 67 | parser.add_argument( 68 | "--confidence-threshold", 69 | type=float, 70 | default=0.5, 71 | help="Minimum score for instance predictions to be shown", 72 | ) 73 | parser.add_argument( 74 | "--opts", 75 | help="Modify config options using the command-line 'KEY VALUE' pairs", 76 | default=[], 77 | nargs=argparse.REMAINDER, 78 | ) 79 | return parser 80 | 81 | 82 | def test_opencv_video_format(codec, file_ext): 83 | with tempfile.TemporaryDirectory(prefix="video_format_test") as dir: 84 | filename = os.path.join(dir, "test_file" + file_ext) 85 | writer = cv2.VideoWriter( 86 | filename=filename, 87 | fourcc=cv2.VideoWriter_fourcc(*codec), 88 | fps=float(30), 89 | frameSize=(10, 10), 90 | isColor=True, 91 | ) 92 | [writer.write(np.zeros((10, 10, 3), np.uint8)) for _ in range(30)] 93 | writer.release() 94 | if os.path.isfile(filename): 95 | return True 96 | return False 97 | 98 | 99 | if __name__ == "__main__": 100 | mp.set_start_method("spawn", force=True) 101 | args = get_parser().parse_args() 102 | setup_logger(name="fvcore") 103 | logger = setup_logger() 104 | logger.info("Arguments: " + str(args)) 105 | 106 | cfg = setup_cfg(args) 107 | 108 | demo = VisualizationDemo(cfg) 109 | 110 | if args.input: 111 | if len(args.input) == 1: 112 | args.input = glob.glob(os.path.expanduser(args.input[0])) 113 | assert args.input, "The input path(s) was not found" 114 | for path in tqdm.tqdm(args.input, disable=not args.output): 115 | # use PIL, to be consistent with evaluation 116 | img = read_image(path, format="BGR") 117 | start_time = time.time() 118 | predictions, visualized_output = demo.run_on_image(img) 119 | logger.info( 120 | "{}: {} in {:.2f}s".format( 121 | path, 122 | "detected {} instances".format(len(predictions["instances"])) 123 | if "instances" in predictions 124 | else "finished", 125 | time.time() - start_time, 126 | ) 127 | ) 128 | 129 | if args.output: 130 | if os.path.isdir(args.output): 131 | assert os.path.isdir(args.output), args.output 132 | out_filename = os.path.join(args.output, os.path.basename(path)) 133 | else: 134 | assert len(args.input) == 1, "Please specify a directory with args.output" 135 | out_filename = args.output 136 | visualized_output.save(out_filename) 137 | else: 138 | cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) 139 | cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1]) 140 | if cv2.waitKey(0) == 27: 141 | break # esc to quit 142 | elif args.webcam: 143 | assert args.input is None, "Cannot have both --input and --webcam!" 144 | assert args.output is None, "output not yet supported with --webcam!" 
145 | cam = cv2.VideoCapture(0) 146 | for vis in tqdm.tqdm(demo.run_on_video(cam)): 147 | cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) 148 | cv2.imshow(WINDOW_NAME, vis) 149 | if cv2.waitKey(1) == 27: 150 | break # esc to quit 151 | cam.release() 152 | cv2.destroyAllWindows() 153 | elif args.video_input: 154 | video = cv2.VideoCapture(args.video_input) 155 | width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH)) 156 | height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) 157 | frames_per_second = video.get(cv2.CAP_PROP_FPS) 158 | num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) 159 | basename = os.path.basename(args.video_input) 160 | codec, file_ext = ( 161 | ("x264", ".mkv") if test_opencv_video_format("x264", ".mkv") else ("mp4v", ".mp4") 162 | ) 163 | if codec == ".mp4v": 164 | warnings.warn("x264 codec not available, switching to mp4v") 165 | if args.output: 166 | if os.path.isdir(args.output): 167 | output_fname = os.path.join(args.output, basename) 168 | output_fname = os.path.splitext(output_fname)[0] + file_ext 169 | else: 170 | output_fname = args.output 171 | assert not os.path.isfile(output_fname), output_fname 172 | output_file = cv2.VideoWriter( 173 | filename=output_fname, 174 | # some installation of opencv may not support x264 (due to its license), 175 | # you can try other format (e.g. MPEG) 176 | fourcc=cv2.VideoWriter_fourcc(*codec), 177 | fps=float(frames_per_second), 178 | frameSize=(width, height), 179 | isColor=True, 180 | ) 181 | assert os.path.isfile(args.video_input) 182 | for vis_frame in tqdm.tqdm(demo.run_on_video(video), total=num_frames): 183 | if args.output: 184 | output_file.write(vis_frame) 185 | else: 186 | cv2.namedWindow(basename, cv2.WINDOW_NORMAL) 187 | cv2.imshow(basename, vis_frame) 188 | if cv2.waitKey(1) == 27: 189 | break # esc to quit 190 | video.release() 191 | if args.output: 192 | output_file.release() 193 | else: 194 | cv2.destroyAllWindows() 195 | -------------------------------------------------------------------------------- /demo/predictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copied from: https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py 3 | import atexit 4 | import bisect 5 | import multiprocessing as mp 6 | from collections import deque 7 | 8 | import cv2 9 | import torch 10 | 11 | from detectron2.data import MetadataCatalog 12 | from detectron2.engine.defaults import DefaultPredictor 13 | from detectron2.utils.video_visualizer import VideoVisualizer 14 | from detectron2.utils.visualizer import ColorMode, Visualizer 15 | 16 | 17 | class VisualizationDemo(object): 18 | def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False): 19 | """ 20 | Args: 21 | cfg (CfgNode): 22 | instance_mode (ColorMode): 23 | parallel (bool): whether to run the model in different processes from visualization. 24 | Useful since the visualization logic can be slow. 25 | """ 26 | self.metadata = MetadataCatalog.get( 27 | cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused" 28 | ) 29 | self.cpu_device = torch.device("cpu") 30 | self.instance_mode = instance_mode 31 | 32 | self.parallel = parallel 33 | if parallel: 34 | num_gpu = torch.cuda.device_count() 35 | self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu) 36 | else: 37 | self.predictor = DefaultPredictor(cfg) 38 | 39 | def run_on_image(self, image): 40 | """ 41 | Args: 42 | image (np.ndarray): an image of shape (H, W, C) (in BGR order). 
43 | This is the format used by OpenCV. 44 | Returns: 45 | predictions (dict): the output of the model. 46 | vis_output (VisImage): the visualized image output. 47 | """ 48 | vis_output = None 49 | predictions = self.predictor(image) 50 | # Convert image from OpenCV BGR format to Matplotlib RGB format. 51 | image = image[:, :, ::-1] 52 | visualizer = Visualizer(image, self.metadata, instance_mode=self.instance_mode) 53 | if "panoptic_seg" in predictions: 54 | panoptic_seg, segments_info = predictions["panoptic_seg"] 55 | vis_output = visualizer.draw_panoptic_seg_predictions( 56 | panoptic_seg.to(self.cpu_device), segments_info 57 | ) 58 | else: 59 | if "sem_seg" in predictions: 60 | vis_output = visualizer.draw_sem_seg( 61 | predictions["sem_seg"].argmax(dim=0).to(self.cpu_device) 62 | ) 63 | if "instances" in predictions: 64 | instances = predictions["instances"].to(self.cpu_device) 65 | vis_output = visualizer.draw_instance_predictions(predictions=instances) 66 | 67 | return predictions, vis_output 68 | 69 | def _frame_from_video(self, video): 70 | while video.isOpened(): 71 | success, frame = video.read() 72 | if success: 73 | yield frame 74 | else: 75 | break 76 | 77 | def run_on_video(self, video): 78 | """ 79 | Visualizes predictions on frames of the input video. 80 | Args: 81 | video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be 82 | either a webcam or a video file. 83 | Yields: 84 | ndarray: BGR visualizations of each video frame. 85 | """ 86 | video_visualizer = VideoVisualizer(self.metadata, self.instance_mode) 87 | 88 | def process_predictions(frame, predictions): 89 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 90 | if "panoptic_seg" in predictions: 91 | panoptic_seg, segments_info = predictions["panoptic_seg"] 92 | vis_frame = video_visualizer.draw_panoptic_seg_predictions( 93 | frame, panoptic_seg.to(self.cpu_device), segments_info 94 | ) 95 | elif "instances" in predictions: 96 | predictions = predictions["instances"].to(self.cpu_device) 97 | vis_frame = video_visualizer.draw_instance_predictions(frame, predictions) 98 | elif "sem_seg" in predictions: 99 | vis_frame = video_visualizer.draw_sem_seg( 100 | frame, predictions["sem_seg"].argmax(dim=0).to(self.cpu_device) 101 | ) 102 | 103 | # Converts Matplotlib RGB format to OpenCV BGR format 104 | vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR) 105 | return vis_frame 106 | 107 | frame_gen = self._frame_from_video(video) 108 | if self.parallel: 109 | buffer_size = self.predictor.default_buffer_size 110 | 111 | frame_data = deque() 112 | 113 | for cnt, frame in enumerate(frame_gen): 114 | frame_data.append(frame) 115 | self.predictor.put(frame) 116 | 117 | if cnt >= buffer_size: 118 | frame = frame_data.popleft() 119 | predictions = self.predictor.get() 120 | yield process_predictions(frame, predictions) 121 | 122 | while len(frame_data): 123 | frame = frame_data.popleft() 124 | predictions = self.predictor.get() 125 | yield process_predictions(frame, predictions) 126 | else: 127 | for frame in frame_gen: 128 | yield process_predictions(frame, self.predictor(frame)) 129 | 130 | 131 | class AsyncPredictor: 132 | """ 133 | A predictor that runs the model asynchronously, possibly on >1 GPUs. 134 | Because rendering the visualization takes considerably amount of time, 135 | this helps improve throughput a little bit when rendering videos. 
136 | """ 137 | 138 | class _StopToken: 139 | pass 140 | 141 | class _PredictWorker(mp.Process): 142 | def __init__(self, cfg, task_queue, result_queue): 143 | self.cfg = cfg 144 | self.task_queue = task_queue 145 | self.result_queue = result_queue 146 | super().__init__() 147 | 148 | def run(self): 149 | predictor = DefaultPredictor(self.cfg) 150 | 151 | while True: 152 | task = self.task_queue.get() 153 | if isinstance(task, AsyncPredictor._StopToken): 154 | break 155 | idx, data = task 156 | result = predictor(data) 157 | self.result_queue.put((idx, result)) 158 | 159 | def __init__(self, cfg, num_gpus: int = 1): 160 | """ 161 | Args: 162 | cfg (CfgNode): 163 | num_gpus (int): if 0, will run on CPU 164 | """ 165 | num_workers = max(num_gpus, 1) 166 | self.task_queue = mp.Queue(maxsize=num_workers * 3) 167 | self.result_queue = mp.Queue(maxsize=num_workers * 3) 168 | self.procs = [] 169 | for gpuid in range(max(num_gpus, 1)): 170 | cfg = cfg.clone() 171 | cfg.defrost() 172 | cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu" 173 | self.procs.append( 174 | AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue) 175 | ) 176 | 177 | self.put_idx = 0 178 | self.get_idx = 0 179 | self.result_rank = [] 180 | self.result_data = [] 181 | 182 | for p in self.procs: 183 | p.start() 184 | atexit.register(self.shutdown) 185 | 186 | def put(self, image): 187 | self.put_idx += 1 188 | self.task_queue.put((self.put_idx, image)) 189 | 190 | def get(self): 191 | self.get_idx += 1 # the index needed for this request 192 | if len(self.result_rank) and self.result_rank[0] == self.get_idx: 193 | res = self.result_data[0] 194 | del self.result_data[0], self.result_rank[0] 195 | return res 196 | 197 | while True: 198 | # make sure the results are returned in the correct order 199 | idx, res = self.result_queue.get() 200 | if idx == self.get_idx: 201 | return res 202 | insert = bisect.bisect(self.result_rank, idx) 203 | self.result_rank.insert(insert, idx) 204 | self.result_data.insert(insert, res) 205 | 206 | def __len__(self): 207 | return self.put_idx - self.get_idx 208 | 209 | def __call__(self, image): 210 | self.put(image) 211 | return self.get() 212 | 213 | def shutdown(self): 214 | for _ in self.procs: 215 | self.task_queue.put(AsyncPredictor._StopToken()) 216 | 217 | @property 218 | def default_buffer_size(self): 219 | return len(self.procs) * 5 220 | -------------------------------------------------------------------------------- /figures/dinosaur.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-Research/MaskDINO/3831d8514a3728535ace8d4ecc7d28044c42dd14/figures/dinosaur.png -------------------------------------------------------------------------------- /figures/framework.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-Research/MaskDINO/3831d8514a3728535ace8d4ecc7d28044c42dd14/figures/framework.jpg -------------------------------------------------------------------------------- /figures/instance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-Research/MaskDINO/3831d8514a3728535ace8d4ecc7d28044c42dd14/figures/instance.png -------------------------------------------------------------------------------- /figures/panoptic.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IDEA-Research/MaskDINO/3831d8514a3728535ace8d4ecc7d28044c42dd14/figures/panoptic.png -------------------------------------------------------------------------------- /figures/semantic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-Research/MaskDINO/3831d8514a3728535ace8d4ecc7d28044c42dd14/figures/semantic.png -------------------------------------------------------------------------------- /figures/sota.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-Research/MaskDINO/3831d8514a3728535ace8d4ecc7d28044c42dd14/figures/sota.png -------------------------------------------------------------------------------- /maskdino/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-Research/MaskDINO/3831d8514a3728535ace8d4ecc7d28044c42dd14/maskdino/.DS_Store -------------------------------------------------------------------------------- /maskdino/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 IDEA. All Rights Reserved. 3 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 4 | # ------------------------------------------------------------------------ 5 | # Modified from Mask2Former https://github.com/facebookresearch/Mask2Former by Feng Li and Hao Zhang. 6 | # ------------------------------------------------------------------------------ 7 | from . import data # register all new datasets 8 | from . import modeling 9 | 10 | # config 11 | from .config import add_maskdino_config 12 | 13 | # dataset loading 14 | from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper 15 | from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper 16 | from .data.dataset_mappers.detr_dataset_mapper import DetrDatasetMapper 17 | 18 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 19 | MaskFormerSemanticDatasetMapper, 20 | ) 21 | 22 | # models 23 | from .maskdino import MaskDINO 24 | # from .data.datasets_detr import coco 25 | from .test_time_augmentation import SemanticSegmentorWithTTA 26 | 27 | # evaluation 28 | from .evaluation.instance_evaluation import InstanceSegEvaluator 29 | # util 30 | from .utils import box_ops, misc, utils 31 | -------------------------------------------------------------------------------- /maskdino/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # ------------------------------------------------------------------------ 3 | # Copyright (c) 2022 IDEA. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | from detectron2.config import CfgNode as CN 7 | 8 | 9 | def add_maskdino_config(cfg): 10 | """ 11 | Add config for MaskDINO. 
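    It registers every MaskDINO-specific key (input/augmentation options, solver extras, loss and
    matcher cost weights, transformer encoder/decoder settings, denoising, inference thresholds,
    and the Swin backbone block) on top of a base detectron2 config.

    Typical usage, mirroring setup_cfg() in demo/demo.py::

        cfg = get_cfg()
        add_deeplab_config(cfg)
        add_maskdino_config(cfg)
        cfg.merge_from_file("configs/coco/instance-segmentation/maskdino_R50_bs16_50ep_3s.yaml")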
12 | """ 13 | # NOTE: configs from original mask2former 14 | # data config 15 | # select the dataset mapper 16 | cfg.INPUT.DATASET_MAPPER_NAME = "MaskDINO_semantic" 17 | # Color augmentation 18 | cfg.INPUT.COLOR_AUG_SSD = False 19 | # We retry random cropping until no single category in semantic segmentation GT occupies more 20 | # than `SINGLE_CATEGORY_MAX_AREA` part of the crop. 21 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0 22 | # Pad image and segmentation GT in dataset mapper. 23 | cfg.INPUT.SIZE_DIVISIBILITY = -1 24 | 25 | # solver config 26 | # weight decay on embedding 27 | cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0 28 | # optimizer 29 | cfg.SOLVER.OPTIMIZER = "ADAMW" 30 | cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1 31 | 32 | # MaskDINO model config 33 | cfg.MODEL.MaskDINO = CN() 34 | cfg.MODEL.MaskDINO.LEARN_TGT = False 35 | 36 | # loss 37 | cfg.MODEL.MaskDINO.PANO_BOX_LOSS = False 38 | cfg.MODEL.MaskDINO.SEMANTIC_CE_LOSS = False 39 | cfg.MODEL.MaskDINO.DEEP_SUPERVISION = True 40 | cfg.MODEL.MaskDINO.NO_OBJECT_WEIGHT = 0.1 41 | cfg.MODEL.MaskDINO.CLASS_WEIGHT = 4.0 42 | cfg.MODEL.MaskDINO.DICE_WEIGHT = 5.0 43 | cfg.MODEL.MaskDINO.MASK_WEIGHT = 5.0 44 | cfg.MODEL.MaskDINO.BOX_WEIGHT = 5. 45 | cfg.MODEL.MaskDINO.GIOU_WEIGHT = 2. 46 | 47 | # cost weight 48 | cfg.MODEL.MaskDINO.COST_CLASS_WEIGHT = 4.0 49 | cfg.MODEL.MaskDINO.COST_DICE_WEIGHT = 5.0 50 | cfg.MODEL.MaskDINO.COST_MASK_WEIGHT = 5.0 51 | cfg.MODEL.MaskDINO.COST_BOX_WEIGHT = 5. 52 | cfg.MODEL.MaskDINO.COST_GIOU_WEIGHT = 2. 53 | 54 | # transformer config 55 | cfg.MODEL.MaskDINO.NHEADS = 8 56 | cfg.MODEL.MaskDINO.DROPOUT = 0.1 57 | cfg.MODEL.MaskDINO.DIM_FEEDFORWARD = 2048 58 | cfg.MODEL.MaskDINO.ENC_LAYERS = 0 59 | cfg.MODEL.MaskDINO.DEC_LAYERS = 6 60 | cfg.MODEL.MaskDINO.INITIAL_PRED = True 61 | cfg.MODEL.MaskDINO.PRE_NORM = False 62 | cfg.MODEL.MaskDINO.BOX_LOSS = True 63 | cfg.MODEL.MaskDINO.HIDDEN_DIM = 256 64 | cfg.MODEL.MaskDINO.NUM_OBJECT_QUERIES = 100 65 | 66 | cfg.MODEL.MaskDINO.ENFORCE_INPUT_PROJ = False 67 | cfg.MODEL.MaskDINO.TWO_STAGE = True 68 | cfg.MODEL.MaskDINO.INITIALIZE_BOX_TYPE = 'no' # ['no', 'bitmask', 'mask2box'] 69 | cfg.MODEL.MaskDINO.DN="seg" 70 | cfg.MODEL.MaskDINO.DN_NOISE_SCALE=0.4 71 | cfg.MODEL.MaskDINO.DN_NUM=100 72 | cfg.MODEL.MaskDINO.PRED_CONV=False 73 | 74 | cfg.MODEL.MaskDINO.EVAL_FLAG = 1 75 | 76 | # MSDeformAttn encoder configs 77 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"] 78 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4 79 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8 80 | cfg.MODEL.SEM_SEG_HEAD.DIM_FEEDFORWARD = 1024 81 | cfg.MODEL.SEM_SEG_HEAD.NUM_FEATURE_LEVELS = 3 82 | cfg.MODEL.SEM_SEG_HEAD.TOTAL_NUM_FEATURE_LEVELS = 4 83 | cfg.MODEL.SEM_SEG_HEAD.FEATURE_ORDER = 'high2low' # ['low2high', 'high2low'] high2low: from high level to low level 84 | 85 | ##################### 86 | 87 | # MaskDINO inference config 88 | cfg.MODEL.MaskDINO.TEST = CN() 89 | cfg.MODEL.MaskDINO.TEST.TEST_FOUCUS_ON_BOX = False 90 | cfg.MODEL.MaskDINO.TEST.SEMANTIC_ON = True 91 | cfg.MODEL.MaskDINO.TEST.INSTANCE_ON = False 92 | cfg.MODEL.MaskDINO.TEST.PANOPTIC_ON = False 93 | cfg.MODEL.MaskDINO.TEST.OBJECT_MASK_THRESHOLD = 0.0 94 | cfg.MODEL.MaskDINO.TEST.OVERLAP_THRESHOLD = 0.0 95 | cfg.MODEL.MaskDINO.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False 96 | cfg.MODEL.MaskDINO.TEST.PANO_TRANSFORM_EVAL = True 97 | cfg.MODEL.MaskDINO.TEST.PANO_TEMPERATURE = 0.06 98 | # cfg.MODEL.MaskDINO.TEST.EVAL_FLAG = 1 99 | 100 | # Sometimes 
`backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet) 101 | # you can use this config to override 102 | cfg.MODEL.MaskDINO.SIZE_DIVISIBILITY = 32 103 | 104 | # pixel decoder config 105 | cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256 106 | # adding transformer in pixel decoder 107 | cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0 108 | # pixel decoder 109 | cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "MaskDINOEncoder" 110 | 111 | # transformer module 112 | cfg.MODEL.MaskDINO.TRANSFORMER_DECODER_NAME = "MaskDINODecoder" 113 | 114 | # LSJ aug 115 | cfg.INPUT.IMAGE_SIZE = 1024 116 | cfg.INPUT.MIN_SCALE = 0.1 117 | cfg.INPUT.MAX_SCALE = 2.0 118 | 119 | # point loss configs 120 | # Number of points sampled during training for a mask point head. 121 | cfg.MODEL.MaskDINO.TRAIN_NUM_POINTS = 112 * 112 122 | # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the 123 | # original paper. 124 | cfg.MODEL.MaskDINO.OVERSAMPLE_RATIO = 3.0 125 | # Importance sampling parameter for PointRend point sampling during training. Parametr `beta` in 126 | # the original paper. 127 | cfg.MODEL.MaskDINO.IMPORTANCE_SAMPLE_RATIO = 0.75 128 | 129 | # swin transformer backbone 130 | cfg.MODEL.SWIN = CN() 131 | cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224 132 | cfg.MODEL.SWIN.PATCH_SIZE = 4 133 | cfg.MODEL.SWIN.EMBED_DIM = 96 134 | cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2] 135 | cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24] 136 | cfg.MODEL.SWIN.WINDOW_SIZE = 7 137 | cfg.MODEL.SWIN.MLP_RATIO = 4.0 138 | cfg.MODEL.SWIN.QKV_BIAS = True 139 | cfg.MODEL.SWIN.QK_SCALE = None 140 | cfg.MODEL.SWIN.DROP_RATE = 0.0 141 | cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0 142 | cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3 143 | cfg.MODEL.SWIN.APE = False 144 | cfg.MODEL.SWIN.PATCH_NORM = True 145 | cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"] 146 | cfg.MODEL.SWIN.USE_CHECKPOINT = False 147 | 148 | cfg.Default_loading=True # a bug in my d2. resume use this; if first time ResNet load, set it false 149 | -------------------------------------------------------------------------------- /maskdino/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import datasets 2 | 3 | -------------------------------------------------------------------------------- /maskdino/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-Research/MaskDINO/3831d8514a3728535ace8d4ecc7d28044c42dd14/maskdino/data/dataset_mappers/__init__.py -------------------------------------------------------------------------------- /maskdino/data/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 IDEA. All Rights Reserved. 3 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 4 | # ------------------------------------------------------------------------ 5 | # Modified from Mask2Former https://github.com/facebookresearch/Mask2Former by Feng Li. 
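# Overview: this mapper implements the LSJ-style (large scale jitter) COCO instance pipeline:
# optional RandomFlip, ResizeScale within [cfg.INPUT.MIN_SCALE, cfg.INPUT.MAX_SCALE], and a
# FixedSizeCrop to cfg.INPUT.IMAGE_SIZE; polygon annotations are rasterized to bitmasks and
# bounding boxes are taken from the transformed masks (see __call__ below).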
6 | import copy 7 | import logging 8 | 9 | import numpy as np 10 | import torch 11 | 12 | from detectron2.config import configurable 13 | from detectron2.data import detection_utils as utils 14 | from detectron2.data import transforms as T 15 | from detectron2.data.transforms import TransformGen 16 | from detectron2.structures import BitMasks, Instances, PolygonMasks 17 | 18 | from pycocotools import mask as coco_mask 19 | 20 | __all__ = ["COCOInstanceNewBaselineDatasetMapper"] 21 | 22 | 23 | def convert_coco_poly_to_mask(segmentations, height, width): 24 | masks = [] 25 | for polygons in segmentations: 26 | rles = coco_mask.frPyObjects(polygons, height, width) 27 | mask = coco_mask.decode(rles) 28 | if len(mask.shape) < 3: 29 | mask = mask[..., None] 30 | mask = torch.as_tensor(mask, dtype=torch.uint8) 31 | mask = mask.any(dim=2) 32 | masks.append(mask) 33 | if masks: 34 | masks = torch.stack(masks, dim=0) 35 | else: 36 | masks = torch.zeros((0, height, width), dtype=torch.uint8) 37 | return masks 38 | 39 | 40 | def build_transform_gen(cfg, is_train): 41 | """ 42 | Create a list of default :class:`Augmentation` from config. 43 | Now it includes resizing and flipping. 44 | Returns: 45 | list[Augmentation] 46 | """ 47 | assert is_train, "Only support training augmentation" 48 | image_size = cfg.INPUT.IMAGE_SIZE 49 | min_scale = cfg.INPUT.MIN_SCALE 50 | max_scale = cfg.INPUT.MAX_SCALE 51 | 52 | augmentation = [] 53 | 54 | if cfg.INPUT.RANDOM_FLIP != "none": 55 | augmentation.append( 56 | T.RandomFlip( 57 | horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal", 58 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical", 59 | ) 60 | ) 61 | 62 | augmentation.extend([ 63 | T.ResizeScale( 64 | min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size 65 | ), 66 | T.FixedSizeCrop(crop_size=(image_size, image_size)), 67 | ]) 68 | 69 | return augmentation 70 | 71 | 72 | class COCOInstanceNewBaselineDatasetMapper: 73 | """ 74 | A callable which takes a dataset dict in Detectron2 Dataset format, 75 | and map it into a format used by MaskFormer. 76 | 77 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. 78 | 79 | The callable currently does the following: 80 | 81 | 1. Read the image from "file_name" 82 | 2. Applies geometric transforms to the image and annotation 83 | 3. Find and applies suitable cropping to the image and annotation 84 | 4. Prepare image and annotation to Tensors 85 | """ 86 | 87 | @configurable 88 | def __init__( 89 | self, 90 | is_train=True, 91 | *, 92 | tfm_gens, 93 | image_format, 94 | ): 95 | """ 96 | NOTE: this interface is experimental. 97 | Args: 98 | is_train: for training or inference 99 | augmentations: a list of augmentations or deterministic transforms to apply 100 | tfm_gens: data augmentation 101 | image_format: an image format supported by :func:`detection_utils.read_image`. 
102 | """ 103 | self.tfm_gens = tfm_gens 104 | logging.getLogger(__name__).info( 105 | "[COCOInstanceNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(str(self.tfm_gens)) 106 | ) 107 | 108 | self.img_format = image_format 109 | self.is_train = is_train 110 | 111 | @classmethod 112 | def from_config(cls, cfg, is_train=True): 113 | # Build augmentation 114 | tfm_gens = build_transform_gen(cfg, is_train) 115 | 116 | ret = { 117 | "is_train": is_train, 118 | "tfm_gens": tfm_gens, 119 | "image_format": cfg.INPUT.FORMAT, 120 | } 121 | return ret 122 | 123 | def __call__(self, dataset_dict): 124 | """ 125 | Args: 126 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 127 | 128 | Returns: 129 | dict: a format that builtin models in detectron2 accept 130 | """ 131 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 132 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 133 | utils.check_image_size(dataset_dict, image) 134 | 135 | # TODO: get padding mask 136 | # by feeding a "segmentation mask" to the same transforms 137 | padding_mask = np.ones(image.shape[:2]) 138 | 139 | image, transforms = T.apply_transform_gens(self.tfm_gens, image) 140 | # the crop transformation has default padding value 0 for segmentation 141 | padding_mask = transforms.apply_segmentation(padding_mask) 142 | padding_mask = ~ padding_mask.astype(bool) 143 | 144 | image_shape = image.shape[:2] # h, w 145 | 146 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 147 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 148 | # Therefore it's important to use torch.Tensor. 149 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 150 | dataset_dict["padding_mask"] = torch.as_tensor(np.ascontiguousarray(padding_mask)) 151 | 152 | if not self.is_train: 153 | # USER: Modify this if you want to keep them for some reason. 154 | dataset_dict.pop("annotations", None) 155 | return dataset_dict 156 | 157 | if "annotations" in dataset_dict: 158 | # USER: Modify this if you want to keep them for some reason. 159 | for anno in dataset_dict["annotations"]: 160 | # Let's always keep mask 161 | anno.pop("keypoints", None) 162 | 163 | # USER: Implement additional transformations if you have other types of data 164 | annos = [ 165 | utils.transform_instance_annotations(obj, transforms, image_shape) 166 | for obj in dataset_dict.pop("annotations") 167 | if obj.get("iscrowd", 0) == 0 168 | ] 169 | # NOTE: does not support BitMask due to augmentation 170 | # Current BitMask cannot handle empty objects 171 | instances = utils.annotations_to_instances(annos, image_shape) 172 | # After transforms such as cropping are applied, the bounding box may no longer 173 | # tightly bound the object. As an example, imagine a triangle object 174 | # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight 175 | # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to 176 | # the intersection of original bounding box and the cropping box. 
177 | if not instances.has('gt_masks'): # this is to avoid empty annotation 178 | instances.gt_masks = PolygonMasks([]) 179 | instances.gt_boxes = instances.gt_masks.get_bounding_boxes() 180 | # Need to filter empty instances first (due to augmentation) 181 | instances = utils.filter_empty_instances(instances) 182 | # Generate masks from polygon 183 | h, w = instances.image_size 184 | if hasattr(instances, 'gt_masks'): 185 | gt_masks = instances.gt_masks 186 | gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w) 187 | instances.gt_masks = gt_masks 188 | 189 | dataset_dict["instances"] = instances 190 | 191 | return dataset_dict 192 | -------------------------------------------------------------------------------- /maskdino/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 IDEA. All Rights Reserved. 3 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 4 | # ------------------------------------------------------------------------ 5 | # Modified from Mask2Former https://github.com/facebookresearch/Mask2Former by Feng Li. 6 | import copy 7 | import logging 8 | 9 | import numpy as np 10 | import torch 11 | 12 | from detectron2.config import configurable 13 | from detectron2.data import detection_utils as utils 14 | from detectron2.data import transforms as T 15 | from detectron2.data.transforms import TransformGen 16 | from detectron2.structures import BitMasks, Boxes, Instances 17 | 18 | __all__ = ["COCOPanopticNewBaselineDatasetMapper"] 19 | 20 | 21 | def build_transform_gen(cfg, is_train): 22 | """ 23 | Create a list of default :class:`Augmentation` from config. 24 | Now it includes resizing and flipping. 25 | Returns: 26 | list[Augmentation] 27 | """ 28 | assert is_train, "Only support training augmentation" 29 | image_size = cfg.INPUT.IMAGE_SIZE 30 | min_scale = cfg.INPUT.MIN_SCALE 31 | max_scale = cfg.INPUT.MAX_SCALE 32 | 33 | augmentation = [] 34 | 35 | if cfg.INPUT.RANDOM_FLIP != "none": 36 | augmentation.append( 37 | T.RandomFlip( 38 | horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal", 39 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical", 40 | ) 41 | ) 42 | 43 | augmentation.extend([ 44 | T.ResizeScale( 45 | min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size 46 | ), 47 | T.FixedSizeCrop(crop_size=(image_size, image_size)), 48 | ]) 49 | 50 | return augmentation 51 | 52 | 53 | # This is specifically designed for the COCO dataset. 54 | class COCOPanopticNewBaselineDatasetMapper: 55 | """ 56 | A callable which takes a dataset dict in Detectron2 Dataset format, 57 | and map it into a format used by MaskFormer. 58 | 59 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. 60 | 61 | The callable currently does the following: 62 | 63 | 1. Read the image from "file_name" 64 | 2. Applies geometric transforms to the image and annotation 65 | 3. Find and applies suitable cropping to the image and annotation 66 | 4. Prepare image and annotation to Tensors 67 | """ 68 | 69 | @configurable 70 | def __init__( 71 | self, 72 | is_train=True, 73 | *, 74 | tfm_gens, 75 | image_format, 76 | ): 77 | """ 78 | NOTE: this interface is experimental. 
79 | Args: 80 | is_train: for training or inference 81 | augmentations: a list of augmentations or deterministic transforms to apply 82 | crop_gen: crop augmentation 83 | tfm_gens: data augmentation 84 | image_format: an image format supported by :func:`detection_utils.read_image`. 85 | """ 86 | self.tfm_gens = tfm_gens 87 | logging.getLogger(__name__).info( 88 | "[COCOPanopticNewBaselineDatasetMapper] Full TransformGens used in training: {}".format( 89 | str(self.tfm_gens) 90 | ) 91 | ) 92 | 93 | self.img_format = image_format 94 | self.is_train = is_train 95 | 96 | @classmethod 97 | def from_config(cls, cfg, is_train=True): 98 | # Build augmentation 99 | tfm_gens = build_transform_gen(cfg, is_train) 100 | 101 | ret = { 102 | "is_train": is_train, 103 | "tfm_gens": tfm_gens, 104 | "image_format": cfg.INPUT.FORMAT, 105 | } 106 | return ret 107 | 108 | def __call__(self, dataset_dict): 109 | """ 110 | Args: 111 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 112 | 113 | Returns: 114 | dict: a format that builtin models in detectron2 accept 115 | """ 116 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 117 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 118 | utils.check_image_size(dataset_dict, image) 119 | 120 | image, transforms = T.apply_transform_gens(self.tfm_gens, image) 121 | image_shape = image.shape[:2] # h, w 122 | 123 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 124 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 125 | # Therefore it's important to use torch.Tensor. 126 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 127 | 128 | if not self.is_train: 129 | # USER: Modify this if you want to keep them for some reason. 130 | dataset_dict.pop("annotations", None) 131 | return dataset_dict 132 | 133 | if "pan_seg_file_name" in dataset_dict: 134 | pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") 135 | segments_info = dataset_dict["segments_info"] 136 | 137 | # apply the same transformation to panoptic segmentation 138 | pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) 139 | 140 | from panopticapi.utils import rgb2id 141 | 142 | pan_seg_gt = rgb2id(pan_seg_gt) 143 | 144 | instances = Instances(image_shape) 145 | classes = [] 146 | masks = [] 147 | for segment_info in segments_info: 148 | class_id = segment_info["category_id"] 149 | if not segment_info["iscrowd"]: 150 | classes.append(class_id) 151 | masks.append(pan_seg_gt == segment_info["id"]) 152 | 153 | classes = np.array(classes) 154 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64) 155 | if len(masks) == 0: 156 | # Some image does not have annotation (all ignored) 157 | instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) 158 | instances.gt_boxes = Boxes(torch.zeros((0, 4))) 159 | else: 160 | masks = BitMasks( 161 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) 162 | ) 163 | instances.gt_masks = masks.tensor 164 | instances.gt_boxes = masks.get_bounding_boxes() 165 | 166 | dataset_dict["instances"] = instances 167 | 168 | return dataset_dict 169 | -------------------------------------------------------------------------------- /maskdino/data/dataset_mappers/detr_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | import copy 3 | import logging 4 | 5 | import numpy as np 6 | import torch 7 | 8 | from detectron2.data import detection_utils as utils 9 | from detectron2.data import transforms as T 10 | from detectron2.data.transforms import TransformGen 11 | from pycocotools import mask as coco_mask 12 | __all__ = ["DetrDatasetMapper"] 13 | def convert_coco_poly_to_mask(segmentations, height, width): 14 | masks = [] 15 | for polygons in segmentations: 16 | rles = coco_mask.frPyObjects(polygons, height, width) 17 | mask = coco_mask.decode(rles) 18 | if len(mask.shape) < 3: 19 | mask = mask[..., None] 20 | mask = torch.as_tensor(mask, dtype=torch.uint8) 21 | mask = mask.any(dim=2) 22 | masks.append(mask) 23 | if masks: 24 | masks = torch.stack(masks, dim=0) 25 | else: 26 | masks = torch.zeros((0, height, width), dtype=torch.uint8) 27 | return masks 28 | 29 | def build_transform_gen(cfg, is_train): 30 | """ 31 | Create a list of :class:`TransformGen` from config. 32 | Returns: 33 | list[TransformGen] 34 | """ 35 | if is_train: 36 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 37 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 38 | sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING 39 | else: 40 | min_size = cfg.INPUT.MIN_SIZE_TEST 41 | max_size = cfg.INPUT.MAX_SIZE_TEST 42 | sample_style = "choice" 43 | if sample_style == "range": 44 | assert len(min_size) == 2, "more than 2 ({}) min_size(s) are provided for ranges".format(len(min_size)) 45 | 46 | logger = logging.getLogger(__name__) 47 | 48 | tfm_gens = [] 49 | if is_train: 50 | tfm_gens.append(T.RandomFlip()) 51 | tfm_gens.append(T.ResizeShortestEdge(min_size, max_size, sample_style)) 52 | if is_train: 53 | logger.info("TransformGens used in training: " + str(tfm_gens)) 54 | return tfm_gens 55 | 56 | 57 | class DetrDatasetMapper: 58 | """ 59 | A callable which takes a dataset dict in Detectron2 Dataset format, 60 | and map it into a format used by DETR. 61 | 62 | The callable currently does the following: 63 | 64 | 1. Read the image from "file_name" 65 | 2. Applies geometric transforms to the image and annotation 66 | 3. Find and applies suitable cropping to the image and annotation 67 | 4. Prepare image and annotation to Tensors 68 | """ 69 | 70 | def __init__(self, cfg, is_train=True): 71 | if cfg.INPUT.CROP.ENABLED and is_train: 72 | self.crop_gen = [ 73 | T.ResizeShortestEdge([400, 500, 600], sample_style="choice"), 74 | T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE), 75 | ] 76 | else: 77 | self.crop_gen = None 78 | 79 | self.mask_on = True 80 | self.tfm_gens = build_transform_gen(cfg, is_train) 81 | logging.getLogger(__name__).info( 82 | "Full TransformGens used in training: {}, crop: {}".format(str(self.tfm_gens), str(self.crop_gen)) 83 | ) 84 | 85 | self.img_format = cfg.INPUT.FORMAT 86 | self.is_train = is_train 87 | 88 | def __call__(self, dataset_dict): 89 | """ 90 | Args: 91 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 
92 | 93 | Returns: 94 | dict: a format that builtin models in detectron2 accept 95 | """ 96 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 97 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 98 | utils.check_image_size(dataset_dict, image) 99 | 100 | if self.crop_gen is None: 101 | image, transforms = T.apply_transform_gens(self.tfm_gens, image) 102 | else: 103 | if np.random.rand() > 0.5: 104 | image, transforms = T.apply_transform_gens(self.tfm_gens, image) 105 | else: 106 | image, transforms = T.apply_transform_gens( 107 | self.tfm_gens[:-1] + self.crop_gen + self.tfm_gens[-1:], image 108 | ) 109 | 110 | image_shape = image.shape[:2] # h, w 111 | 112 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 113 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 114 | # Therefore it's important to use torch.Tensor. 115 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 116 | 117 | if not self.is_train: 118 | # USER: Modify this if you want to keep them for some reason. 119 | dataset_dict.pop("annotations", None) 120 | return dataset_dict 121 | 122 | if "annotations" in dataset_dict: 123 | # USER: Modify this if you want to keep them for some reason. 124 | for anno in dataset_dict["annotations"]: 125 | if not self.mask_on: 126 | anno.pop("segmentation", None) 127 | anno.pop("keypoints", None) 128 | 129 | # USER: Implement additional transformations if you have other types of data 130 | annos = [ 131 | utils.transform_instance_annotations(obj, transforms, image_shape) 132 | for obj in dataset_dict.pop("annotations") 133 | if obj.get("iscrowd", 0) == 0 134 | ] 135 | instances = utils.annotations_to_instances(annos, image_shape) 136 | instances = utils.filter_empty_instances(instances) 137 | h, w = instances.image_size 138 | if hasattr(instances, 'gt_masks'): 139 | gt_masks = instances.gt_masks 140 | gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w) 141 | instances.gt_masks = gt_masks 142 | 143 | dataset_dict["instances"] = instances 144 | return dataset_dict 145 | -------------------------------------------------------------------------------- /maskdino/data/dataset_mappers/mask_former_semantic_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | import logging 4 | 5 | import numpy as np 6 | import torch 7 | from torch.nn import functional as F 8 | 9 | from detectron2.config import configurable 10 | from detectron2.data import MetadataCatalog 11 | from detectron2.data import detection_utils as utils 12 | from detectron2.data import transforms as T 13 | from detectron2.projects.point_rend import ColorAugSSDTransform 14 | from detectron2.structures import BitMasks, Boxes, Instances 15 | 16 | __all__ = ["MaskFormerSemanticDatasetMapper"] 17 | 18 | 19 | class MaskFormerSemanticDatasetMapper: 20 | """ 21 | A callable which takes a dataset dict in Detectron2 Dataset format, 22 | and map it into a format used by MaskFormer for semantic segmentation. 23 | 24 | The callable currently does the following: 25 | 26 | 1. Read the image from "file_name" 27 | 2. Applies geometric transforms to the image and annotation 28 | 3. Find and applies suitable cropping to the image and annotation 29 | 4. 
Prepare image and annotation to Tensors 30 | """ 31 | 32 | @configurable 33 | def __init__( 34 | self, 35 | is_train=True, 36 | *, 37 | augmentations, 38 | image_format, 39 | ignore_label, 40 | size_divisibility, 41 | ): 42 | """ 43 | NOTE: this interface is experimental. 44 | Args: 45 | is_train: for training or inference 46 | augmentations: a list of augmentations or deterministic transforms to apply 47 | image_format: an image format supported by :func:`detection_utils.read_image`. 48 | ignore_label: the label that is ignored to evaluation 49 | size_divisibility: pad image size to be divisible by this value 50 | """ 51 | self.is_train = is_train 52 | self.tfm_gens = augmentations 53 | self.img_format = image_format 54 | self.ignore_label = ignore_label 55 | self.size_divisibility = size_divisibility 56 | 57 | logger = logging.getLogger(__name__) 58 | mode = "training" if is_train else "inference" 59 | logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}") 60 | 61 | @classmethod 62 | def from_config(cls, cfg, is_train=True): 63 | # Build augmentation 64 | augs = [ 65 | T.ResizeShortestEdge( 66 | cfg.INPUT.MIN_SIZE_TRAIN, 67 | cfg.INPUT.MAX_SIZE_TRAIN, 68 | cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, 69 | ) 70 | ] 71 | if cfg.INPUT.CROP.ENABLED: 72 | augs.append( 73 | T.RandomCrop_CategoryAreaConstraint( 74 | cfg.INPUT.CROP.TYPE, 75 | cfg.INPUT.CROP.SIZE, 76 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA, 77 | cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 78 | ) 79 | ) 80 | if cfg.INPUT.COLOR_AUG_SSD: 81 | augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) 82 | augs.append(T.RandomFlip()) 83 | 84 | # Assume always applies to the training set. 85 | dataset_names = cfg.DATASETS.TRAIN 86 | meta = MetadataCatalog.get(dataset_names[0]) 87 | ignore_label = meta.ignore_label 88 | 89 | ret = { 90 | "is_train": is_train, 91 | "augmentations": augs, 92 | "image_format": cfg.INPUT.FORMAT, 93 | "ignore_label": ignore_label, 94 | "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY, 95 | } 96 | return ret 97 | 98 | def __call__(self, dataset_dict): 99 | """ 100 | Args: 101 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 102 | 103 | Returns: 104 | dict: a format that builtin models in detectron2 accept 105 | """ 106 | assert self.is_train, "MaskFormerSemanticDatasetMapper should only be used for training!" 107 | 108 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 109 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 110 | utils.check_image_size(dataset_dict, image) 111 | 112 | if "sem_seg_file_name" in dataset_dict: 113 | # PyTorch transformation not implemented for uint16, so converting it to double first 114 | sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") 115 | else: 116 | sem_seg_gt = None 117 | 118 | if sem_seg_gt is None: 119 | raise ValueError( 120 | "Cannot find 'sem_seg_file_name' for semantic segmentation dataset {}.".format( 121 | dataset_dict["file_name"] 122 | ) 123 | ) 124 | 125 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt) 126 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) 127 | image = aug_input.image 128 | sem_seg_gt = aug_input.sem_seg 129 | 130 | # Pad image and segmentation label here! 
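        # F.pad takes (left, right, top, bottom): the image is padded on the right/bottom until
        # H and W reach self.size_divisibility (pixel value 128), and the label map is padded
        # with self.ignore_label so the padded region is treated as ignored pixels.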
131 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 132 | if sem_seg_gt is not None: 133 | sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) 134 | 135 | if self.size_divisibility > 0: 136 | image_size = (image.shape[-2], image.shape[-1]) 137 | padding_size = [ 138 | 0, 139 | self.size_divisibility - image_size[1], 140 | 0, 141 | self.size_divisibility - image_size[0], 142 | ] 143 | image = F.pad(image, padding_size, value=128).contiguous() 144 | if sem_seg_gt is not None: 145 | sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous() 146 | 147 | image_shape = (image.shape[-2], image.shape[-1]) # h, w 148 | 149 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 150 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 151 | # Therefore it's important to use torch.Tensor. 152 | dataset_dict["image"] = image 153 | 154 | if sem_seg_gt is not None: 155 | dataset_dict["sem_seg"] = sem_seg_gt.long() 156 | 157 | if "annotations" in dataset_dict: 158 | raise ValueError("Semantic segmentation dataset should not have 'annotations'.") 159 | 160 | # Prepare per-category binary masks 161 | if sem_seg_gt is not None: 162 | sem_seg_gt = sem_seg_gt.numpy() 163 | instances = Instances(image_shape) 164 | classes = np.unique(sem_seg_gt) 165 | # remove ignored region 166 | classes = classes[classes != self.ignore_label] 167 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64) 168 | 169 | masks = [] 170 | for class_id in classes: 171 | masks.append(sem_seg_gt == class_id) 172 | 173 | if len(masks) == 0: 174 | # Some image does not have annotation (all ignored) 175 | instances.gt_masks = torch.zeros((0, sem_seg_gt.shape[-2], sem_seg_gt.shape[-1])) 176 | instances.gt_boxes = Boxes(torch.zeros((0,4))) 177 | else: 178 | masks = BitMasks( 179 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) 180 | ) 181 | instances.gt_masks = masks.tensor 182 | instances.gt_boxes = masks.get_bounding_boxes() 183 | 184 | dataset_dict["instances"] = instances 185 | 186 | return dataset_dict 187 | -------------------------------------------------------------------------------- /maskdino/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import ( 3 | register_ade20k_full, 4 | register_ade20k_panoptic, 5 | register_coco_stuff_10k, 6 | register_mapillary_vistas, 7 | register_coco_panoptic_annos_semseg, 8 | register_ade20k_instance, 9 | register_mapillary_vistas_panoptic, 10 | ) 11 | -------------------------------------------------------------------------------- /maskdino/data/datasets/register_ade20k_instance.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
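# Overview: registers the ADE20K instance-segmentation splits ("ade20k_instance_train" and
# "ade20k_instance_val") as COCO-format datasets. The 100 "thing" categories in ADE_CATEGORIES
# below are remapped to contiguous ids in [0, 99].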
2 | import json 3 | import logging 4 | import numpy as np 5 | import os 6 | from PIL import Image 7 | 8 | from detectron2.data import DatasetCatalog, MetadataCatalog 9 | from detectron2.data.datasets.coco import load_coco_json, register_coco_instances 10 | from detectron2.utils.file_io import PathManager 11 | 12 | ADE_CATEGORIES = [{'id': 7, 'name': 'bed'}, {'id': 8, 'name': 'windowpane'}, {'id': 10, 'name': 'cabinet'}, {'id': 12, 'name': 'person'}, {'id': 14, 'name': 'door'}, {'id': 15, 'name': 'table'}, {'id': 18, 'name': 'curtain'}, {'id': 19, 'name': 'chair'}, {'id': 20, 'name': 'car'}, {'id': 22, 'name': 'painting'}, {'id': 23, 'name': 'sofa'}, {'id': 24, 'name': 'shelf'}, {'id': 27, 'name': 'mirror'}, {'id': 30, 'name': 'armchair'}, {'id': 31, 'name': 'seat'}, {'id': 32, 'name': 'fence'}, {'id': 33, 'name': 'desk'}, {'id': 35, 'name': 'wardrobe'}, {'id': 36, 'name': 'lamp'}, {'id': 37, 'name': 'bathtub'}, {'id': 38, 'name': 'railing'}, {'id': 39, 'name': 'cushion'}, {'id': 41, 'name': 'box'}, {'id': 42, 'name': 'column'}, {'id': 43, 'name': 'signboard'}, {'id': 44, 'name': 'chest of drawers'}, {'id': 45, 'name': 'counter'}, {'id': 47, 'name': 'sink'}, {'id': 49, 'name': 'fireplace'}, {'id': 50, 'name': 'refrigerator'}, {'id': 53, 'name': 'stairs'}, {'id': 55, 'name': 'case'}, {'id': 56, 'name': 'pool table'}, {'id': 57, 'name': 'pillow'}, {'id': 58, 'name': 'screen door'}, {'id': 62, 'name': 'bookcase'}, {'id': 64, 'name': 'coffee table'}, {'id': 65, 'name': 'toilet'}, {'id': 66, 'name': 'flower'}, {'id': 67, 'name': 'book'}, {'id': 69, 'name': 'bench'}, {'id': 70, 'name': 'countertop'}, {'id': 71, 'name': 'stove'}, {'id': 72, 'name': 'palm'}, {'id': 73, 'name': 'kitchen island'}, {'id': 74, 'name': 'computer'}, {'id': 75, 'name': 'swivel chair'}, {'id': 76, 'name': 'boat'}, {'id': 78, 'name': 'arcade machine'}, {'id': 80, 'name': 'bus'}, {'id': 81, 'name': 'towel'}, {'id': 82, 'name': 'light'}, {'id': 83, 'name': 'truck'}, {'id': 85, 'name': 'chandelier'}, {'id': 86, 'name': 'awning'}, {'id': 87, 'name': 'streetlight'}, {'id': 88, 'name': 'booth'}, {'id': 89, 'name': 'television receiver'}, {'id': 90, 'name': 'airplane'}, {'id': 92, 'name': 'apparel'}, {'id': 93, 'name': 'pole'}, {'id': 95, 'name': 'bannister'}, {'id': 97, 'name': 'ottoman'}, {'id': 98, 'name': 'bottle'}, {'id': 102, 'name': 'van'}, {'id': 103, 'name': 'ship'}, {'id': 104, 'name': 'fountain'}, {'id': 107, 'name': 'washer'}, {'id': 108, 'name': 'plaything'}, {'id': 110, 'name': 'stool'}, {'id': 111, 'name': 'barrel'}, {'id': 112, 'name': 'basket'}, {'id': 115, 'name': 'bag'}, {'id': 116, 'name': 'minibike'}, {'id': 118, 'name': 'oven'}, {'id': 119, 'name': 'ball'}, {'id': 120, 'name': 'food'}, {'id': 121, 'name': 'step'}, {'id': 123, 'name': 'trade name'}, {'id': 124, 'name': 'microwave'}, {'id': 125, 'name': 'pot'}, {'id': 126, 'name': 'animal'}, {'id': 127, 'name': 'bicycle'}, {'id': 129, 'name': 'dishwasher'}, {'id': 130, 'name': 'screen'}, {'id': 132, 'name': 'sculpture'}, {'id': 133, 'name': 'hood'}, {'id': 134, 'name': 'sconce'}, {'id': 135, 'name': 'vase'}, {'id': 136, 'name': 'traffic light'}, {'id': 137, 'name': 'tray'}, {'id': 138, 'name': 'ashcan'}, {'id': 139, 'name': 'fan'}, {'id': 142, 'name': 'plate'}, {'id': 143, 'name': 'monitor'}, {'id': 144, 'name': 'bulletin board'}, {'id': 146, 'name': 'radiator'}, {'id': 147, 'name': 'glass'}, {'id': 148, 'name': 'clock'}, {'id': 149, 'name': 'flag'}] 13 | 14 | 15 | _PREDEFINED_SPLITS = { 16 | # point annotations without masks 17 | "ade20k_instance_train": ( 18 
| "ADEChallengeData2016/images/training", 19 | "ADEChallengeData2016/ade20k_instance_train.json", 20 | ), 21 | "ade20k_instance_val": ( 22 | "ADEChallengeData2016/images/validation", 23 | "ADEChallengeData2016/ade20k_instance_val.json", 24 | ), 25 | } 26 | 27 | 28 | def _get_ade_instances_meta(): 29 | thing_ids = [k["id"] for k in ADE_CATEGORIES] 30 | assert len(thing_ids) == 100, len(thing_ids) 31 | # Mapping from the incontiguous ADE category id to an id in [0, 99] 32 | thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} 33 | thing_classes = [k["name"] for k in ADE_CATEGORIES] 34 | ret = { 35 | "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, 36 | "thing_classes": thing_classes, 37 | } 38 | return ret 39 | 40 | 41 | def register_all_ade20k_instance(root): 42 | for key, (image_root, json_file) in _PREDEFINED_SPLITS.items(): 43 | # Assume pre-defined datasets live in `./datasets`. 44 | register_coco_instances( 45 | key, 46 | _get_ade_instances_meta(), 47 | os.path.join(root, json_file) if "://" not in json_file else json_file, 48 | os.path.join(root, image_root), 49 | ) 50 | 51 | 52 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 53 | register_all_ade20k_instance(_root) 54 | -------------------------------------------------------------------------------- /maskdino/data/datasets/register_coco_panoptic_annos_semseg.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import json 3 | import os 4 | 5 | from detectron2.data import DatasetCatalog, MetadataCatalog 6 | from detectron2.data.datasets import load_sem_seg 7 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES 8 | from detectron2.utils.file_io import PathManager 9 | 10 | 11 | _PREDEFINED_SPLITS_COCO_PANOPTIC = { 12 | "coco_2017_train_panoptic": ( 13 | # This is the original panoptic annotation directory 14 | "coco/panoptic_train2017", 15 | "coco/annotations/panoptic_train2017.json", 16 | # This directory contains semantic annotations that are 17 | # converted from panoptic annotations. 18 | # It is used by PanopticFPN. 19 | # You can use the script at detectron2/datasets/prepare_panoptic_fpn.py 20 | # to create these directories. 21 | "coco/panoptic_semseg_train2017", 22 | ), 23 | "coco_2017_val_panoptic": ( 24 | "coco/panoptic_val2017", 25 | "coco/annotations/panoptic_val2017.json", 26 | "coco/panoptic_semseg_val2017", 27 | ), 28 | } 29 | 30 | 31 | def get_metadata(): 32 | meta = {} 33 | # The following metadata maps contiguous id from [0, #thing categories + 34 | # #stuff categories) to their names and colors. We have to replica of the 35 | # same name and color under "thing_*" and "stuff_*" because the current 36 | # visualization function in D2 handles thing and class classes differently 37 | # due to some heuristic used in Panoptic FPN. We keep the same naming to 38 | # enable reusing existing visualization functions. 
39 | thing_classes = [k["name"] for k in COCO_CATEGORIES if k["isthing"] == 1] 40 | thing_colors = [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 1] 41 | stuff_classes = [k["name"] for k in COCO_CATEGORIES] 42 | stuff_colors = [k["color"] for k in COCO_CATEGORIES] 43 | 44 | meta["thing_classes"] = thing_classes 45 | meta["thing_colors"] = thing_colors 46 | meta["stuff_classes"] = stuff_classes 47 | meta["stuff_colors"] = stuff_colors 48 | 49 | # Convert category id for training: 50 | # category id: like semantic segmentation, it is the class id for each 51 | # pixel. Since there are some classes not used in evaluation, the category 52 | # id is not always contiguous and thus we have two set of category ids: 53 | # - original category id: category id in the original dataset, mainly 54 | # used for evaluation. 55 | # - contiguous category id: [0, #classes), in order to train the linear 56 | # softmax classifier. 57 | thing_dataset_id_to_contiguous_id = {} 58 | stuff_dataset_id_to_contiguous_id = {} 59 | 60 | for i, cat in enumerate(COCO_CATEGORIES): 61 | if cat["isthing"]: 62 | thing_dataset_id_to_contiguous_id[cat["id"]] = i 63 | # else: 64 | # stuff_dataset_id_to_contiguous_id[cat["id"]] = i 65 | 66 | # in order to use sem_seg evaluator 67 | stuff_dataset_id_to_contiguous_id[cat["id"]] = i 68 | 69 | meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id 70 | meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id 71 | 72 | return meta 73 | 74 | 75 | def load_coco_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta): 76 | """ 77 | Args: 78 | image_dir (str): path to the raw dataset. e.g., "~/coco/train2017". 79 | gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017". 80 | json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json". 81 | Returns: 82 | list[dict]: a list of dicts in Detectron2 standard format. (See 83 | `Using Custom Datasets `_ ) 84 | """ 85 | 86 | def _convert_category_id(segment_info, meta): 87 | if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]: 88 | segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][ 89 | segment_info["category_id"] 90 | ] 91 | segment_info["isthing"] = True 92 | else: 93 | segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][ 94 | segment_info["category_id"] 95 | ] 96 | segment_info["isthing"] = False 97 | return segment_info 98 | 99 | with PathManager.open(json_file) as f: 100 | json_info = json.load(f) 101 | 102 | ret = [] 103 | for ann in json_info["annotations"]: 104 | image_id = int(ann["image_id"]) 105 | # TODO: currently we assume image and label has the same filename but 106 | # different extension, and images have extension ".jpg" for COCO. Need 107 | # to make image extension a user-provided argument if we extend this 108 | # function to support other COCO-like datasets. 109 | image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg") 110 | label_file = os.path.join(gt_dir, ann["file_name"]) 111 | sem_label_file = os.path.join(semseg_dir, ann["file_name"]) 112 | segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]] 113 | ret.append( 114 | { 115 | "file_name": image_file, 116 | "image_id": image_id, 117 | "pan_seg_file_name": label_file, 118 | "sem_seg_file_name": sem_label_file, 119 | "segments_info": segments_info, 120 | } 121 | ) 122 | assert len(ret), f"No images found in {image_dir}!" 
123 | assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"] 124 | assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"] 125 | assert PathManager.isfile(ret[0]["sem_seg_file_name"]), ret[0]["sem_seg_file_name"] 126 | return ret 127 | 128 | 129 | def register_coco_panoptic_annos_sem_seg( 130 | name, metadata, image_root, panoptic_root, panoptic_json, sem_seg_root, instances_json 131 | ): 132 | panoptic_name = name 133 | delattr(MetadataCatalog.get(panoptic_name), "thing_classes") 134 | delattr(MetadataCatalog.get(panoptic_name), "thing_colors") 135 | MetadataCatalog.get(panoptic_name).set( 136 | thing_classes=metadata["thing_classes"], 137 | thing_colors=metadata["thing_colors"], 138 | # thing_dataset_id_to_contiguous_id=metadata["thing_dataset_id_to_contiguous_id"], 139 | ) 140 | 141 | # the name is "coco_2017_train_panoptic_with_sem_seg" and "coco_2017_val_panoptic_with_sem_seg" 142 | semantic_name = name + "_with_sem_seg" 143 | DatasetCatalog.register( 144 | semantic_name, 145 | lambda: load_coco_panoptic_json(panoptic_json, image_root, panoptic_root, sem_seg_root, metadata), 146 | ) 147 | MetadataCatalog.get(semantic_name).set( 148 | sem_seg_root=sem_seg_root, 149 | panoptic_root=panoptic_root, 150 | image_root=image_root, 151 | panoptic_json=panoptic_json, 152 | json_file=instances_json, 153 | evaluator_type="coco_panoptic_seg", 154 | ignore_label=255, 155 | label_divisor=1000, 156 | **metadata, 157 | ) 158 | 159 | 160 | def register_all_coco_panoptic_annos_sem_seg(root): 161 | for ( 162 | prefix, 163 | (panoptic_root, panoptic_json, semantic_root), 164 | ) in _PREDEFINED_SPLITS_COCO_PANOPTIC.items(): 165 | prefix_instances = prefix[: -len("_panoptic")] 166 | instances_meta = MetadataCatalog.get(prefix_instances) 167 | image_root, instances_json = instances_meta.image_root, instances_meta.json_file 168 | 169 | register_coco_panoptic_annos_sem_seg( 170 | prefix, 171 | get_metadata(), 172 | image_root, 173 | os.path.join(root, panoptic_root), 174 | os.path.join(root, panoptic_json), 175 | os.path.join(root, semantic_root), 176 | instances_json, 177 | ) 178 | 179 | 180 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 181 | register_all_coco_panoptic_annos_sem_seg(_root) 182 | -------------------------------------------------------------------------------- /maskdino/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-Research/MaskDINO/3831d8514a3728535ace8d4ecc7d28044c42dd14/maskdino/evaluation/__init__.py -------------------------------------------------------------------------------- /maskdino/evaluation/instance_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
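# Usage sketch (illustrative only; `cfg` and `model` are assumed to come from the
# usual detectron2 training setup, and "ade20k_instance_val" is one of the splits
# registered in maskdino/data/datasets/register_ade20k_instance.py):
#
#   from detectron2.data import build_detection_test_loader
#   from detectron2.evaluation import inference_on_dataset
#
#   evaluator = InstanceSegEvaluator("ade20k_instance_val", output_dir="./output")
#   val_loader = build_detection_test_loader(cfg, "ade20k_instance_val")
#   results = inference_on_dataset(model, val_loader, evaluator)
#
# The class below differs from the stock COCOEvaluator only in _eval_predictions:
# it drops the assertion that contiguous category ids cover [0, num_classes - 1],
# so datasets with non-contiguous ids (e.g. ADE20K instances) can be evaluated.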
2 | import contextlib 3 | import copy 4 | import io 5 | import itertools 6 | import json 7 | import logging 8 | import numpy as np 9 | import os 10 | import pickle 11 | from collections import OrderedDict 12 | import pycocotools.mask as mask_util 13 | import torch 14 | from pycocotools.coco import COCO 15 | from pycocotools.cocoeval import COCOeval 16 | from tabulate import tabulate 17 | 18 | import detectron2.utils.comm as comm 19 | from detectron2.config import CfgNode 20 | from detectron2.data import MetadataCatalog 21 | from detectron2.data.datasets.coco import convert_to_coco_json 22 | from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco 23 | from detectron2.evaluation.fast_eval_api import COCOeval_opt 24 | from detectron2.structures import Boxes, BoxMode, pairwise_iou 25 | from detectron2.utils.file_io import PathManager 26 | from detectron2.utils.logger import create_small_table 27 | 28 | 29 | # Modified from COCOEvaluator for instance segmentation 30 | class InstanceSegEvaluator(COCOEvaluator): 31 | """ 32 | Evaluate AR for object proposals, AP for instance detection/segmentation, AP 33 | for keypoint detection outputs using COCO's metrics. 34 | See http://cocodataset.org/#detection-eval and 35 | http://cocodataset.org/#keypoints-eval to understand its metrics. 36 | The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means 37 | the metric cannot be computed (e.g. due to no predictions made). 38 | 39 | In addition to COCO, this evaluator is able to support any bounding box detection, 40 | instance segmentation, or keypoint detection dataset. 41 | """ 42 | 43 | def _eval_predictions(self, predictions, img_ids=None): 44 | """ 45 | Evaluate predictions. Fill self._results with the metrics of the tasks. 46 | """ 47 | self._logger.info("Preparing results for COCO format ...") 48 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) 49 | tasks = self._tasks or self._tasks_from_predictions(coco_results) 50 | 51 | # unmap the category ids for COCO 52 | if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): 53 | dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id 54 | # all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) 55 | # num_classes = len(all_contiguous_ids) 56 | # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 57 | 58 | reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} 59 | for result in coco_results: 60 | category_id = result["category_id"] 61 | # assert category_id < num_classes, ( 62 | # f"A prediction has class={category_id}, " 63 | # f"but the dataset only has {num_classes} classes and " 64 | # f"predicted class id should be in [0, {num_classes - 1}]." 65 | # ) 66 | assert category_id in reverse_id_mapping, ( 67 | f"A prediction has class={category_id}, " 68 | f"but the dataset only has class ids in {dataset_id_to_contiguous_id}."
69 | ) 70 | result["category_id"] = reverse_id_mapping[category_id] 71 | 72 | if self._output_dir: 73 | file_path = os.path.join(self._output_dir, "coco_instances_results.json") 74 | self._logger.info("Saving results to {}".format(file_path)) 75 | with PathManager.open(file_path, "w") as f: 76 | f.write(json.dumps(coco_results)) 77 | f.flush() 78 | 79 | if not self._do_evaluation: 80 | self._logger.info("Annotations are not available for evaluation.") 81 | return 82 | 83 | self._logger.info( 84 | "Evaluating predictions with {} COCO API...".format( 85 | "unofficial" if self._use_fast_impl else "official" 86 | ) 87 | ) 88 | for task in sorted(tasks): 89 | assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!" 90 | coco_eval = ( 91 | _evaluate_predictions_on_coco( 92 | self._coco_api, 93 | coco_results, 94 | task, 95 | kpt_oks_sigmas=self._kpt_oks_sigmas, 96 | use_fast_impl=self._use_fast_impl, 97 | img_ids=img_ids, 98 | max_dets_per_image=self._max_dets_per_image, 99 | ) 100 | if len(coco_results) > 0 101 | else None # cocoapi does not handle empty results very well 102 | ) 103 | 104 | res = self._derive_coco_results( 105 | coco_eval, task, class_names=self._metadata.get("thing_classes") 106 | ) 107 | self._results[task] = res 108 | -------------------------------------------------------------------------------- /maskdino/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IDEA, Inc. and its affiliates. 2 | from .backbone.swin import D2SwinTransformer 3 | from .pixel_decoder.maskdino_encoder import MaskDINOEncoder 4 | from .meta_arch.maskdino_head import MaskDINOHead 5 | 6 | -------------------------------------------------------------------------------- /maskdino/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /maskdino/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IDEA, Inc. and its affiliates. 2 | 3 | -------------------------------------------------------------------------------- /maskdino/modeling/meta_arch/maskdino_head.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 IDEA. All Rights Reserved. 3 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 4 | # ------------------------------------------------------------------------ 5 | # Modified from Mask2Former https://github.com/facebookresearch/Mask2Former by Feng Li and Hao Zhang. 
6 | # ------------------------------------------------------------------------------ 7 | import logging 8 | from typing import Callable, Dict, List, Optional, Tuple, Union 9 | 10 | from torch import nn 11 | 12 | from detectron2.config import configurable 13 | from detectron2.layers import Conv2d, ShapeSpec, get_norm 14 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY 15 | 16 | from ..transformer_decoder.maskdino_decoder import build_transformer_decoder 17 | from ..pixel_decoder.maskdino_encoder import build_pixel_decoder 18 | 19 | 20 | @SEM_SEG_HEADS_REGISTRY.register() 21 | class MaskDINOHead(nn.Module): 22 | @configurable 23 | def __init__( 24 | self, 25 | input_shape: Dict[str, ShapeSpec], 26 | *, 27 | num_classes: int, 28 | pixel_decoder: nn.Module, 29 | loss_weight: float = 1.0, 30 | ignore_value: int = -1, 31 | transformer_predictor: nn.Module, 32 | ): 33 | """ 34 | Args: 35 | input_shape: shapes (channels and stride) of the input features 36 | num_classes: number of classes to predict 37 | pixel_decoder: the pixel decoder module 38 | loss_weight: loss weight 39 | ignore_value: category id to be ignored during training. 40 | transformer_predictor: the transformer decoder that makes prediction 41 | transformer_in_feature: input feature name to the transformer_predictor 42 | """ 43 | super().__init__() 44 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) 45 | self.in_features = [k for k, v in input_shape] 46 | self.ignore_value = ignore_value 47 | self.common_stride = 4 48 | self.loss_weight = loss_weight 49 | 50 | self.pixel_decoder = pixel_decoder 51 | self.predictor = transformer_predictor 52 | 53 | self.num_classes = num_classes 54 | 55 | @classmethod 56 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): 57 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 58 | 59 | return { 60 | "input_shape": { 61 | k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES 62 | }, 63 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 64 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 65 | "pixel_decoder": build_pixel_decoder(cfg, input_shape), 66 | "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, 67 | "transformer_predictor": build_transformer_decoder( 68 | cfg, 69 | transformer_predictor_in_channels, 70 | mask_classification=True, 71 | ), 72 | } 73 | 74 | def forward(self, features, mask=None,targets=None): 75 | return self.layers(features, mask,targets=targets) 76 | 77 | def layers(self, features, mask=None,targets=None): 78 | mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(features, mask) 79 | 80 | predictions = self.predictor(multi_scale_features, mask_features, mask, targets=targets) 81 | 82 | return predictions 83 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IDEA, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | try: 22 | import MultiScaleDeformableAttention as MSDA 23 | except ModuleNotFoundError as e: 24 | info_string = ( 25 | "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" 26 | "\t`cd maskdino/modeling/pixel_decoder/ops`\n" 27 | "\t`sh make.sh`\n" 28 | ) 29 | raise ModuleNotFoundError(info_string) 30 | 31 | 32 | class MSDeformAttnFunction(Function): 33 | @staticmethod 34 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 35 | ctx.im2col_step = im2col_step 36 | output = MSDA.ms_deform_attn_forward( 37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 38 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 39 | return output 40 | 41 | @staticmethod 42 | @once_differentiable 43 | def backward(ctx, grad_output): 44 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 45 | grad_value, grad_sampling_loc, grad_attn_weight = \ 46 | MSDA.ms_deform_attn_backward( 47 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 48 | 49 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 50 | 51 | 52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 53 | # for debug and test only, 54 | # need to use cuda version instead 55 | N_, S_, M_, D_ = value.shape 56 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 57 | value_list = value.split([H_ * W_ 
for H_, W_ in value_spatial_shapes], dim=1) 58 | sampling_grids = 2 * sampling_locations - 1 59 | sampling_value_list = [] 60 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 61 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 62 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 63 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 64 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 65 | # N_*M_, D_, Lq_, P_ 66 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 67 | mode='bilinear', padding_mode='zeros', align_corners=False) 68 | sampling_value_list.append(sampling_value_l_) 69 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 70 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 71 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 72 | return output.transpose(1, 2).contiguous() 73 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | # Copyright (c) Facebook, Inc. and its affiliates. 11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 12 | 13 | python setup.py build install 14 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import warnings 17 | import math 18 | 19 | import torch 20 | from torch import nn 21 | import torch.nn.functional as F 22 | from torch.nn.init import xavier_uniform_, constant_ 23 | 24 | from ..functions import MSDeformAttnFunction 25 | from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch 26 | 27 | 28 | def _is_power_of_2(n): 29 | if (not isinstance(n, int)) or (n < 0): 30 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 31 | return (n & (n-1) == 0) and n != 0 32 | 33 | 34 | class MSDeformAttn(nn.Module): 35 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): 36 | """ 37 | Multi-Scale Deformable Attention Module 38 | :param d_model hidden dimension 39 | :param n_levels number of feature levels 40 | :param n_heads number of attention heads 41 | :param n_points number of sampling points per attention head per feature level 42 | """ 43 | super().__init__() 44 | if d_model % n_heads != 0: 45 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 46 | _d_per_head = d_model // n_heads 47 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 48 | if not _is_power_of_2(_d_per_head): 49 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 50 | "which is more efficient in our CUDA implementation.") 51 | 52 | self.im2col_step = 128 53 | 54 | self.d_model = d_model 55 | self.n_levels = n_levels 56 | self.n_heads = n_heads 57 | self.n_points = n_points 58 | 59 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 60 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 61 | self.value_proj = nn.Linear(d_model, d_model) 62 | self.output_proj = nn.Linear(d_model, d_model) 63 | 64 | self._reset_parameters() 65 | 66 | def _reset_parameters(self): 67 | constant_(self.sampling_offsets.weight.data, 0.) 68 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 69 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 70 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 71 | for i in range(self.n_points): 72 | grid_init[:, :, i, :] *= i + 1 73 | with torch.no_grad(): 74 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 75 | constant_(self.attention_weights.weight.data, 0.) 76 | constant_(self.attention_weights.bias.data, 0.) 77 | xavier_uniform_(self.value_proj.weight.data) 78 | constant_(self.value_proj.bias.data, 0.) 79 | xavier_uniform_(self.output_proj.weight.data) 80 | constant_(self.output_proj.bias.data, 0.) 
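    # Shape sketch for forward() below (illustrative numbers only): with the
    # defaults d_model=256, n_levels=4, n_heads=8, n_points=4 and feature maps of
    # sizes (100, 150), (50, 75), (25, 38), (13, 19):
    #   query:                   (N, Len_q, 256)
    #   reference_points:        (N, Len_q, 4, 2) normalized to [0, 1]
    #                            (or (N, Len_q, 4, 4) when reference boxes are used)
    #   input_flatten:           (N, 100*150 + 50*75 + 25*38 + 13*19, 256)
    #   input_spatial_shapes:    tensor([[100, 150], [50, 75], [25, 38], [13, 19]])
    #   input_level_start_index: tensor([0, 15000, 18750, 19700])
    #   output:                  (N, Len_q, 256)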
81 | 82 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 83 | """ 84 | :param query (N, Length_{query}, C) 85 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 86 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 87 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 88 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 89 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 90 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 91 | 92 | :return output (N, Length_{query}, C) 93 | """ 94 | N, Len_q, _ = query.shape 95 | N, Len_in, _ = input_flatten.shape 96 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 97 | 98 | value = self.value_proj(input_flatten) 99 | if input_padding_mask is not None: 100 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 101 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 102 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 103 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 104 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 105 | # N, Len_q, n_heads, n_levels, n_points, 2 106 | if reference_points.shape[-1] == 2: 107 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 108 | sampling_locations = reference_points[:, :, None, :, None, :] \ 109 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] 110 | elif reference_points.shape[-1] == 4: 111 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 112 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 113 | else: 114 | raise ValueError( 115 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 116 | try: 117 | output = MSDeformAttnFunction.apply( 118 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 119 | except: 120 | # CPU 121 | output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 122 | # # For FLOPs calculation only 123 | # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 124 | output = self.output_proj(output) 125 | return output 126 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | import os 13 | import glob 14 | 15 | import torch 16 | 17 | from torch.utils.cpp_extension import CUDA_HOME 18 | from torch.utils.cpp_extension import CppExtension 19 | from torch.utils.cpp_extension import CUDAExtension 20 | 21 | from setuptools import find_packages 22 | from setuptools import setup 23 | 24 | requirements = ["torch", "torchvision"] 25 | 26 | def get_extensions(): 27 | this_dir = os.path.dirname(os.path.abspath(__file__)) 28 | extensions_dir = os.path.join(this_dir, "src") 29 | 30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 33 | 34 | sources = main_file + source_cpu 35 | extension = CppExtension 36 | extra_compile_args = {"cxx": []} 37 | define_macros = [] 38 | 39 | # Force cuda since torch ask for a device, not if cuda is in fact available. 40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: 41 | extension = CUDAExtension 42 | sources += source_cuda 43 | define_macros += [("WITH_CUDA", None)] 44 | extra_compile_args["nvcc"] = [ 45 | "-DCUDA_HAS_FP16=1", 46 | "-D__CUDA_NO_HALF_OPERATORS__", 47 | "-D__CUDA_NO_HALF_CONVERSIONS__", 48 | "-D__CUDA_NO_HALF2_OPERATORS__", 49 | ] 50 | else: 51 | if CUDA_HOME is None: 52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') 53 | else: 54 | raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') 55 | 56 | sources = [os.path.join(extensions_dir, s) for s in sources] 57 | include_dirs = [extensions_dir] 58 | ext_modules = [ 59 | extension( 60 | "MultiScaleDeformableAttention", 61 | sources, 62 | include_dirs=include_dirs, 63 | define_macros=define_macros, 64 | extra_compile_args=extra_compile_args, 65 | ) 66 | ] 67 | return ext_modules 68 | 69 | setup( 70 | name="MultiScaleDeformableAttention", 71 | version="1.0", 72 | author="Weijie Su", 73 | url="https://github.com/fundamentalvision/Deformable-DETR", 74 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 75 | packages=find_packages(exclude=("configs", "tests",)), 76 | ext_modules=get_extensions(), 77 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 78 | ) 79 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include <vector> 17 | 18 | #include <ATen/ATen.h> 19 | #include <ATen/cuda/CUDAContext.h> 20 | 21 | 22 | at::Tensor 23 | ms_deform_attn_cpu_forward( 24 | const at::Tensor &value, 25 | const at::Tensor &spatial_shapes, 26 | const at::Tensor &level_start_index, 27 | const at::Tensor &sampling_loc, 28 | const at::Tensor &attn_weight, 29 | const int im2col_step) 30 | { 31 | AT_ERROR("Not implemented on the CPU"); 32 | } 33 | 34 | std::vector<at::Tensor> 35 | ms_deform_attn_cpu_backward( 36 | const at::Tensor &value, 37 | const at::Tensor &spatial_shapes, 38 | const at::Tensor &level_start_index, 39 | const at::Tensor &sampling_loc, 40 | const at::Tensor &attn_weight, 41 | const at::Tensor &grad_output, 42 | const int im2col_step) 43 | { 44 | AT_ERROR("Not implemented on the CPU"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include <torch/extension.h> 18 | 19 | at::Tensor 20 | ms_deform_attn_cpu_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step); 27 | 28 | std::vector<at::Tensor> 29 | ms_deform_attn_cpu_backward( 30 | const at::Tensor &value, 31 | const at::Tensor &spatial_shapes, 32 | const at::Tensor &level_start_index, 33 | const at::Tensor &sampling_loc, 34 | const at::Tensor &attn_weight, 35 | const at::Tensor &grad_output, 36 | const int im2col_step); 37 | 38 | 39 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include <vector> 17 | #include "cuda/ms_deform_im2col_cuda.cuh" 18 | 19 | #include <ATen/ATen.h> 20 | #include <ATen/cuda/CUDAContext.h> 21 | #include <cuda.h> 22 | #include <cuda_runtime.h> 23 | 24 | 25 | at::Tensor ms_deform_attn_cuda_forward( 26 | const at::Tensor &value, 27 | const at::Tensor &spatial_shapes, 28 | const at::Tensor &level_start_index, 29 | const at::Tensor &sampling_loc, 30 | const at::Tensor &attn_weight, 31 | const int im2col_step) 32 | { 33 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 34 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 35 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 36 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 37 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 38 | 39 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 40 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 41 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 42 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 43 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 44 | 45 | const int batch = value.size(0); 46 | const int spatial_size = value.size(1); 47 | const int num_heads = value.size(2); 48 | const int channels = value.size(3); 49 | 50 | const int num_levels = spatial_shapes.size(0); 51 | 52 | const int num_query = sampling_loc.size(1); 53 | const int num_point = sampling_loc.size(4); 54 | 55 | const int im2col_step_ = std::min(batch, im2col_step); 56 | 57 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 58 | 59 | auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); 60 | 61 | const int batch_n = im2col_step_; 62 | auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 63 | auto per_value_size = spatial_size * num_heads * channels; 64 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 65 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; 66 | for (int n = 0; n < batch/im2col_step_; ++n) 67 | { 68 | auto columns = output_n.select(0, n); 69 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] { 70 | ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(), 71 | value.data<scalar_t>() + n * im2col_step_ * per_value_size, 72 | spatial_shapes.data<int64_t>(), 73 | level_start_index.data<int64_t>(), 74 | sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size, 75 | attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size, 76 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 77 | columns.data<scalar_t>()); 78 | 79 | })); 80 | } 81 | 82 | output = output.view({batch, num_query, num_heads*channels}); 83 | 84 | return output; 85 | } 86 | 87
| 88 | std::vector<at::Tensor> ms_deform_attn_cuda_backward( 89 | const at::Tensor &value, 90 | const at::Tensor &spatial_shapes, 91 | const at::Tensor &level_start_index, 92 | const at::Tensor &sampling_loc, 93 | const at::Tensor &attn_weight, 94 | const at::Tensor &grad_output, 95 | const int im2col_step) 96 | { 97 | 98 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 99 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 100 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 101 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 102 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 103 | AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); 104 | 105 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 106 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 107 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 108 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 109 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 110 | AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); 111 | 112 | const int batch = value.size(0); 113 | const int spatial_size = value.size(1); 114 | const int num_heads = value.size(2); 115 | const int channels = value.size(3); 116 | 117 | const int num_levels = spatial_shapes.size(0); 118 | 119 | const int num_query = sampling_loc.size(1); 120 | const int num_point = sampling_loc.size(4); 121 | 122 | const int im2col_step_ = std::min(batch, im2col_step); 123 | 124 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 125 | 126 | auto grad_value = at::zeros_like(value); 127 | auto grad_sampling_loc = at::zeros_like(sampling_loc); 128 | auto grad_attn_weight = at::zeros_like(attn_weight); 129 | 130 | const int batch_n = im2col_step_; 131 | auto per_value_size = spatial_size * num_heads * channels; 132 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 133 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; 134 | auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 135 | 136 | for (int n = 0; n < batch/im2col_step_; ++n) 137 | { 138 | auto grad_output_g = grad_output_n.select(0, n); 139 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] { 140 | ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(), 141 | grad_output_g.data<scalar_t>(), 142 | value.data<scalar_t>() + n * im2col_step_ * per_value_size, 143 | spatial_shapes.data<int64_t>(), 144 | level_start_index.data<int64_t>(), 145 | sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size, 146 | attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size, 147 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 148 | grad_value.data<scalar_t>() + n * im2col_step_ * per_value_size, 149 | grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size, 150 | grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size); 151 | 152 | })); 153 | } 154 | 155 | return { 156 | grad_value, grad_sampling_loc, grad_attn_weight 157 | }; 158 | } --------------------------------------------------------------------------------
/maskdino/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include <torch/extension.h> 18 | 19 | at::Tensor ms_deform_attn_cuda_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step); 26 | 27 | std::vector<at::Tensor> ms_deform_attn_cuda_backward( 28 | const at::Tensor &value, 29 | const at::Tensor &spatial_shapes, 30 | const at::Tensor &level_start_index, 31 | const at::Tensor &sampling_loc, 32 | const at::Tensor &attn_weight, 33 | const at::Tensor &grad_output, 34 | const int im2col_step); 35 | 36 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | 18 | #include "cpu/ms_deform_attn_cpu.h" 19 | 20 | #ifdef WITH_CUDA 21 | #include "cuda/ms_deform_attn_cuda.h" 22 | #endif 23 | 24 | 25 | at::Tensor 26 | ms_deform_attn_forward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const int im2col_step) 33 | { 34 | if (value.type().is_cuda()) 35 | { 36 | #ifdef WITH_CUDA 37 | return ms_deform_attn_cuda_forward( 38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector<at::Tensor> 47 | ms_deform_attn_backward( 48 | const at::Tensor &value, 49 | const at::Tensor &spatial_shapes, 50 | const at::Tensor &level_start_index, 51 | const at::Tensor &sampling_loc, 52 | const at::Tensor &attn_weight, 53 | const at::Tensor &grad_output, 54 | const int im2col_step) 55 | { 56 | if (value.type().is_cuda()) 57 | { 58 | #ifdef WITH_CUDA 59 | return ms_deform_attn_cuda_backward( 60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 61 | #else 62 | AT_ERROR("Not compiled with GPU support"); 63 | #endif 64 | } 65 | AT_ERROR("Not implemented on the CPU"); 66 | } 67 | 68 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include "ms_deform_attn.h" 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import time 17 | import torch 18 | import torch.nn as nn 19 | from torch.autograd import gradcheck 20 | 21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | N, M, D = 1, 2, 2 25 | Lq, L, P = 2, 2, 2 26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 28 | S = sum([(H*W).item() for H, W in shapes]) 29 | 30 | 31 | torch.manual_seed(3) 32 | 33 | 34 | @torch.no_grad() 35 | def check_forward_equal_with_pytorch_double(): 36 | value = torch.rand(N, S, M, D).cuda() * 0.01 37 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 38 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 39 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 40 | im2col_step = 2 41 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 42 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 43 | fwdok = torch.allclose(output_cuda, output_pytorch) 44 | max_abs_err = (output_cuda - output_pytorch).abs().max() 45 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 46 | 47 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 48 | 49 | 50 | @torch.no_grad() 51 | def check_forward_equal_with_pytorch_float(): 52 | value = torch.rand(N, S, M, D).cuda() * 0.01 53 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 54 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 55 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 56 | im2col_step = 2 57 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 58 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 59 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 60 | max_abs_err = (output_cuda - output_pytorch).abs().max() 61 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 62 | 63 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 64 | 65 | 66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 67 | 68 | value = torch.rand(N, S, M, channels).cuda() * 0.01 69 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 70 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 71 | attention_weights /= 
attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 72 | im2col_step = 2 73 | func = MSDeformAttnFunction.apply 74 | 75 | value.requires_grad = grad_value 76 | sampling_locations.requires_grad = grad_sampling_loc 77 | attention_weights.requires_grad = grad_attn_weight 78 | 79 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 80 | 81 | print(f'* {gradok} check_gradient_numerical(D={channels})') 82 | 83 | 84 | if __name__ == '__main__': 85 | check_forward_equal_with_pytorch_double() 86 | check_forward_equal_with_pytorch_float() 87 | 88 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 89 | check_gradient_numerical(channels, True, True, True) 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 IDEA. All Rights Reserved. 3 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 4 | # ------------------------------------------------------------------------ 5 | # Modified from Mask2Former https://github.com/facebookresearch/Mask2Former by Feng Li and Hao Zhang. 6 | """ 7 | Various positional encodings for the transformer. 8 | """ 9 | import math 10 | 11 | import torch 12 | from torch import nn 13 | 14 | 15 | class PositionEmbeddingSine(nn.Module): 16 | """ 17 | This is a more standard version of the position embedding, very similar to the one 18 | used by the Attention is all you need paper, generalized to work on images. 19 | """ 20 | 21 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 22 | super().__init__() 23 | self.num_pos_feats = num_pos_feats 24 | self.temperature = temperature 25 | self.normalize = normalize 26 | if scale is not None and normalize is False: 27 | raise ValueError("normalize should be True if scale is passed") 28 | if scale is None: 29 | scale = 2 * math.pi 30 | self.scale = scale 31 | 32 | def forward(self, x, mask=None): 33 | if mask is None: 34 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 35 | not_mask = ~mask 36 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 37 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 38 | if self.normalize: 39 | eps = 1e-6 40 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 41 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 42 | 43 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 44 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 45 | 46 | pos_x = x_embed[:, :, :, None] / dim_t 47 | pos_y = y_embed[:, :, :, None] / dim_t 48 | pos_x = torch.stack( 49 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 50 | ).flatten(3) 51 | pos_y = torch.stack( 52 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 53 | ).flatten(3) 54 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 55 | return pos 56 | 57 | def __repr__(self, _repr_indent=4): 58 | head = "Positional encoding " + self.__class__.__name__ 59 | body = [ 60 | "num_pos_feats: {}".format(self.num_pos_feats), 61 | "temperature: {}".format(self.temperature), 62 | "normalize: {}".format(self.normalize), 63 | "scale: {}".format(self.scale), 64 | ] 65 | # _repr_indent = 4 66 | lines = 
[head] + [" " * _repr_indent + line for line in body] 67 | return "\n".join(lines) 68 | -------------------------------------------------------------------------------- /maskdino/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IDEA, Inc. and its affiliates. 2 | from .maskdino_decoder import MaskDINODecoder 3 | 4 | -------------------------------------------------------------------------------- /maskdino/test_time_augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | import logging 4 | from itertools import count 5 | 6 | import numpy as np 7 | import torch 8 | from fvcore.transforms import HFlipTransform 9 | from torch import nn 10 | from torch.nn.parallel import DistributedDataParallel 11 | 12 | from detectron2.data.detection_utils import read_image 13 | from detectron2.modeling import DatasetMapperTTA 14 | 15 | 16 | __all__ = [ 17 | "SemanticSegmentorWithTTA", 18 | ] 19 | 20 | 21 | class SemanticSegmentorWithTTA(nn.Module): 22 | """ 23 | A SemanticSegmentor with test-time augmentation enabled. 24 | Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`. 25 | """ 26 | 27 | def __init__(self, cfg, model, tta_mapper=None, batch_size=1): 28 | """ 29 | Args: 30 | cfg (CfgNode): 31 | model (SemanticSegmentor): a SemanticSegmentor to apply TTA on. 32 | tta_mapper (callable): takes a dataset dict and returns a list of 33 | augmented versions of the dataset dict. Defaults to 34 | `DatasetMapperTTA(cfg)`. 35 | batch_size (int): batch the augmented images into this batch size for inference. 36 | """ 37 | super().__init__() 38 | if isinstance(model, DistributedDataParallel): 39 | model = model.module 40 | self.cfg = cfg.clone() 41 | 42 | self.model = model 43 | 44 | if tta_mapper is None: 45 | tta_mapper = DatasetMapperTTA(cfg) 46 | self.tta_mapper = tta_mapper 47 | self.batch_size = batch_size 48 | 49 | def __call__(self, batched_inputs): 50 | """ 51 | Same input/output format as :meth:`SemanticSegmentor.forward` 52 | """ 53 | 54 | def _maybe_read_image(dataset_dict): 55 | ret = copy.copy(dataset_dict) 56 | if "image" not in ret: 57 | image = read_image(ret.pop("file_name"), self.model.input_format) 58 | image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW 59 | ret["image"] = image 60 | if "height" not in ret and "width" not in ret: 61 | ret["height"] = image.shape[1] 62 | ret["width"] = image.shape[2] 63 | return ret 64 | 65 | processed_results = [] 66 | for x in batched_inputs: 67 | result = self._inference_one_image(_maybe_read_image(x)) 68 | processed_results.append(result) 69 | return processed_results 70 | 71 | def _inference_one_image(self, input): 72 | """ 73 | Args: 74 | input (dict): one dataset dict with "image" field being a CHW tensor 75 | Returns: 76 | dict: one output dict 77 | """ 78 | orig_shape = (input["height"], input["width"]) 79 | augmented_inputs, tfms = self._get_augmented_inputs(input) 80 | 81 | final_predictions = None 82 | count_predictions = 0 83 | for input, tfm in zip(augmented_inputs, tfms): 84 | count_predictions += 1 85 | with torch.no_grad(): 86 | if final_predictions is None: 87 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 88 | final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2]) 89 | else: 90 | final_predictions = self.model([input])[0].pop("sem_seg") 91 | 
else: 92 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 93 | final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2]) 94 | else: 95 | final_predictions += self.model([input])[0].pop("sem_seg") 96 | 97 | final_predictions = final_predictions / count_predictions 98 | return {"sem_seg": final_predictions} 99 | 100 | def _get_augmented_inputs(self, input): 101 | augmented_inputs = self.tta_mapper(input) 102 | tfms = [x.pop("transforms") for x in augmented_inputs] 103 | return augmented_inputs, tfms 104 | -------------------------------------------------------------------------------- /maskdino/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # import misc -------------------------------------------------------------------------------- /maskdino/utils/box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Utilities for bounding box manipulation and GIoU. 4 | """ 5 | import torch 6 | from torchvision.ops.boxes import box_area 7 | 8 | 9 | def box_cxcywh_to_xyxy(x): 10 | x_c, y_c, w, h = x.unbind(-1) 11 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 12 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 13 | return torch.stack(b, dim=-1) 14 | 15 | 16 | def box_xyxy_to_cxcywh(x): 17 | x0, y0, x1, y1 = x.unbind(-1) 18 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 19 | (x1 - x0), (y1 - y0)] 20 | return torch.stack(b, dim=-1) 21 | 22 | 23 | # modified from torchvision to also return the union 24 | def box_iou(boxes1, boxes2): 25 | area1 = box_area(boxes1) 26 | area2 = box_area(boxes2) 27 | 28 | 29 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 30 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 31 | 32 | wh = (rb - lt).clamp(min=0) # [N,M,2] 33 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 34 | 35 | union = area1[:, None] + area2 - inter 36 | 37 | iou = inter / (union + 1e-6) 38 | return iou, union 39 | 40 | 41 | def generalized_box_iou(boxes1, boxes2): 42 | """ 43 | Generalized IoU from https://giou.stanford.edu/ 44 | 45 | The boxes should be in [x0, y0, x1, y1] format 46 | 47 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 48 | and M = len(boxes2) 49 | """ 50 | # degenerate boxes gives inf / nan results 51 | # so do an early check 52 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 53 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 54 | iou, union = box_iou(boxes1, boxes2) 55 | 56 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 57 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 58 | 59 | wh = (rb - lt).clamp(min=0) # [N,M,2] 60 | area = wh[:, :, 0] * wh[:, :, 1] 61 | 62 | return iou - (area - union) / (area + 1e-6) 63 | 64 | 65 | 66 | # modified from torchvision to also return the union 67 | def box_iou_pairwise(boxes1, boxes2): 68 | area1 = box_area(boxes1) 69 | area2 = box_area(boxes2) 70 | 71 | lt = torch.max(boxes1[:, :2], boxes2[:, :2]) # [N,2] 72 | rb = torch.min(boxes1[:, 2:], boxes2[:, 2:]) # [N,2] 73 | 74 | wh = (rb - lt).clamp(min=0) # [N,2] 75 | inter = wh[:, 0] * wh[:, 1] # [N] 76 | 77 | union = area1 + area2 - inter 78 | 79 | iou = inter / union 80 | return iou, union 81 | 82 | 83 | def generalized_box_iou_pairwise(boxes1, boxes2): 84 | """ 85 | Generalized IoU from https://giou.stanford.edu/ 86 | 87 | Input: 88 | - boxes1, boxes2: N,4 89 | Output: 90 | - giou: N, 4 91 | """ 92 | # degenerate boxes gives 
inf / nan results 93 | # so do an early check 94 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 95 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 96 | assert boxes1.shape == boxes2.shape 97 | iou, union = box_iou_pairwise(boxes1, boxes2) # N, 4 98 | 99 | lt = torch.min(boxes1[:, :2], boxes2[:, :2]) 100 | rb = torch.max(boxes1[:, 2:], boxes2[:, 2:]) 101 | 102 | wh = (rb - lt).clamp(min=0) # [N,2] 103 | area = wh[:, 0] * wh[:, 1] 104 | 105 | return iou - (area - union) / area 106 | 107 | def masks_to_boxes(masks): 108 | """Compute the bounding boxes around the provided masks 109 | 110 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 111 | 112 | Returns a [N, 4] tensors, with the boxes in xyxy format 113 | """ 114 | if masks.numel() == 0: 115 | return torch.zeros((0, 4), device=masks.device) 116 | 117 | h, w = masks.shape[-2:] 118 | 119 | y = torch.arange(0, h, dtype=torch.float, device=masks.device) 120 | x = torch.arange(0, w, dtype=torch.float, device=masks.device) 121 | y, x = torch.meshgrid(y, x) 122 | 123 | x_mask = (masks * x.unsqueeze(0)) 124 | x_max = x_mask.flatten(1).max(-1)[0] 125 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 126 | 127 | y_mask = (masks * y.unsqueeze(0)) 128 | y_max = y_mask.flatten(1).max(-1)[0] 129 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 130 | 131 | return torch.stack([x_min, y_min, x_max, y_max], 1) 132 | 133 | if __name__ == '__main__': 134 | x = torch.rand(5, 4) 135 | y = torch.rand(3, 4) 136 | iou, union = box_iou(x, y) -------------------------------------------------------------------------------- /maskdino/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py 3 | """ 4 | Misc functions, including distributed helpers. 5 | 6 | Mostly copy-paste from torchvision references. 
7 | """ 8 | from typing import List, Optional 9 | 10 | import torch 11 | import torch.distributed as dist 12 | import torchvision 13 | from torch import Tensor 14 | 15 | 16 | def _max_by_axis(the_list): 17 | # type: (List[List[int]]) -> List[int] 18 | maxes = the_list[0] 19 | for sublist in the_list[1:]: 20 | for index, item in enumerate(sublist): 21 | maxes[index] = max(maxes[index], item) 22 | return maxes 23 | 24 | 25 | class NestedTensor(object): 26 | def __init__(self, tensors, mask: Optional[Tensor]): 27 | self.tensors = tensors 28 | self.mask = mask 29 | 30 | def to(self, device): 31 | # type: (Device) -> NestedTensor # noqa 32 | cast_tensor = self.tensors.to(device) 33 | mask = self.mask 34 | if mask is not None: 35 | assert mask is not None 36 | cast_mask = mask.to(device) 37 | else: 38 | cast_mask = None 39 | return NestedTensor(cast_tensor, cast_mask) 40 | 41 | def decompose(self): 42 | return self.tensors, self.mask 43 | 44 | def __repr__(self): 45 | return str(self.tensors) 46 | 47 | 48 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 49 | # TODO make this more general 50 | if tensor_list[0].ndim == 3: 51 | if torchvision._is_tracing(): 52 | # nested_tensor_from_tensor_list() does not export well to ONNX 53 | # call _onnx_nested_tensor_from_tensor_list() instead 54 | return _onnx_nested_tensor_from_tensor_list(tensor_list) 55 | 56 | # TODO make it support different-sized images 57 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 58 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 59 | batch_shape = [len(tensor_list)] + max_size 60 | b, c, h, w = batch_shape 61 | dtype = tensor_list[0].dtype 62 | device = tensor_list[0].device 63 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 64 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 65 | for img, pad_img, m in zip(tensor_list, tensor, mask): 66 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 67 | m[: img.shape[1], : img.shape[2]] = False 68 | else: 69 | raise ValueError("not supported") 70 | return NestedTensor(tensor, mask) 71 | 72 | 73 | # _onnx_nested_tensor_from_tensor_list() is an implementation of 74 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
75 | @torch.jit.unused 76 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: 77 | max_size = [] 78 | for i in range(tensor_list[0].dim()): 79 | max_size_i = torch.max( 80 | torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) 81 | ).to(torch.int64) 82 | max_size.append(max_size_i) 83 | max_size = tuple(max_size) 84 | 85 | # work around for 86 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 87 | # m[: img.shape[1], :img.shape[2]] = False 88 | # which is not yet supported in onnx 89 | padded_imgs = [] 90 | padded_masks = [] 91 | for img in tensor_list: 92 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 93 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 94 | padded_imgs.append(padded_img) 95 | 96 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) 97 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) 98 | padded_masks.append(padded_mask.to(torch.bool)) 99 | 100 | tensor = torch.stack(padded_imgs) 101 | mask = torch.stack(padded_masks) 102 | 103 | return NestedTensor(tensor, mask=mask) 104 | 105 | 106 | def is_dist_avail_and_initialized(): 107 | if not dist.is_available(): 108 | return False 109 | if not dist.is_initialized(): 110 | return False 111 | return True 112 | 113 | def masks_to_boxes(masks): 114 | """Compute the bounding boxes around the provided masks 115 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 116 | Returns a [N, 4] tensors, with the boxes in xyxy format 117 | """ 118 | if masks.numel() == 0: 119 | return torch.zeros((0, 4), device=masks.device) 120 | 121 | h, w = masks.shape[-2:] 122 | 123 | y = torch.arange(0, h, dtype=torch.float) 124 | x = torch.arange(0, w, dtype=torch.float) 125 | y, x = torch.meshgrid(y, x) 126 | 127 | x_mask = masks * x.unsqueeze(0) 128 | x_max = x_mask.flatten(1).max(-1)[0] 129 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 130 | 131 | y_mask = masks * y.unsqueeze(0) 132 | y_max = y_mask.flatten(1).max(-1)[0] 133 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 134 | 135 | return torch.stack([x_min, y_min, x_max, y_max], 1) -------------------------------------------------------------------------------- /maskdino/utils/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import copy 3 | from torch import nn, Tensor 4 | import os 5 | 6 | import math 7 | import torch.nn.functional as F 8 | from torch import nn 9 | 10 | 11 | class MLP(nn.Module): 12 | """ Very simple multi-layer perceptron (also called FFN)""" 13 | 14 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 15 | super().__init__() 16 | self.num_layers = num_layers 17 | h = [hidden_dim] * (num_layers - 1) 18 | self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) 19 | 20 | def forward(self, x): 21 | for i, layer in enumerate(self.layers): 22 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 23 | return x 24 | 25 | 26 | def inverse_sigmoid(x, eps=1e-5): 27 | x = x.clamp(min=0, max=1) 28 | x1 = x.clamp(min=eps) 29 | x2 = (1 - x).clamp(min=eps) 30 | return torch.log(x1/x2) 31 | 32 | 33 | def gen_encoder_output_proposals(memory:Tensor, memory_padding_mask:Tensor, spatial_shapes:Tensor): 34 | """ 35 | Input: 36 | - memory: bs, \sum{hw}, d_model 37 | - 
memory_padding_mask: bs, \sum{hw} 38 | - spatial_shapes: nlevel, 2 39 | Output: 40 | - output_memory: bs, \sum{hw}, d_model 41 | - output_proposals: bs, \sum{hw}, 4 42 | """ 43 | N_, S_, C_ = memory.shape 44 | base_scale = 4.0 45 | proposals = [] 46 | _cur = 0 47 | for lvl, (H_, W_) in enumerate(spatial_shapes): 48 | mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1) 49 | valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1) 50 | valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1) 51 | 52 | grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device), 53 | torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device)) 54 | grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) 55 | 56 | scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2) 57 | grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale 58 | wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl) 59 | proposal = torch.cat((grid, wh), -1).view(N_, -1, 4) 60 | proposals.append(proposal) 61 | _cur += (H_ * W_) 62 | output_proposals = torch.cat(proposals, 1) 63 | output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) 64 | output_proposals = torch.log(output_proposals / (1 - output_proposals)) 65 | output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf')) 66 | output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf')) 67 | 68 | output_memory = memory 69 | output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0)) 70 | output_memory = output_memory.masked_fill(~output_proposals_valid, float(0)) 71 | return output_memory, output_proposals 72 | 73 | 74 | def gen_sineembed_for_position(pos_tensor): 75 | # n_query, bs, _ = pos_tensor.size() 76 | # sineembed_tensor = torch.zeros(n_query, bs, 256) 77 | scale = 2 * math.pi 78 | dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) 79 | dim_t = 10000 ** (2 * (dim_t // 2) / 128) 80 | x_embed = pos_tensor[:, :, 0] * scale 81 | y_embed = pos_tensor[:, :, 1] * scale 82 | pos_x = x_embed[:, :, None] / dim_t 83 | pos_y = y_embed[:, :, None] / dim_t 84 | pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) 85 | pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) 86 | if pos_tensor.size(-1) == 2: 87 | pos = torch.cat((pos_y, pos_x), dim=2) 88 | elif pos_tensor.size(-1) == 4: 89 | w_embed = pos_tensor[:, :, 2] * scale 90 | pos_w = w_embed[:, :, None] / dim_t 91 | pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2) 92 | 93 | h_embed = pos_tensor[:, :, 3] * scale 94 | pos_h = h_embed[:, :, None] / dim_t 95 | pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2) 96 | 97 | pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) 98 | else: 99 | raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1))) 100 | return pos 101 | 102 | 103 | def _get_activation_fn(activation): 104 | """Return an activation function given a string""" 105 | if activation == "relu": 106 | return F.relu 107 | if activation == "gelu": 108 | return F.gelu 109 | if activation == "glu": 110 | return F.glu 111 | if activation == "prelu": 112 | return nn.PReLU() 113 | if activation == "selu": 114 | return F.selu 115 | raise RuntimeError(F"activation should be relu/gelu, not 
{activation}.") 116 | 117 | 118 | def _get_clones(module, N, layer_share=False): 119 | 120 | if layer_share: 121 | return nn.ModuleList([module for i in range(N)]) 122 | else: 123 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | scipy 3 | shapely 4 | timm 5 | h5py 6 | submitit 7 | scikit-image 8 | opencv-python 9 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | We use ResNet and Swin as the backbone in our model. 2 | 3 | * `convert-torchvision-to-d2.py` 4 | 5 | Tool to convert torchvision pre-trained weights for D2. 6 | 7 | ``` 8 | wget https://download.pytorch.org/models/resnet101-63fe2227.pth 9 | python tools/convert-torchvision-to-d2.py resnet101-63fe2227.pth R-101.pkl 10 | ``` 11 | 12 | * `convert-pretrained-swin-model-to-d2.py` 13 | 14 | Tool to convert Swin Transformer pre-trained weights for D2. 15 | 16 | ``` 17 | pip install timm 18 | 19 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 20 | python tools/convert-pretrained-swin-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 21 | 22 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth 23 | python tools/convert-pretrained-swin-model-to-d2.py swin_small_patch4_window7_224.pth swin_small_patch4_window7_224.pkl 24 | 25 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth 26 | python tools/convert-pretrained-swin-model-to-d2.py swin_base_patch4_window12_384_22k.pth swin_base_patch4_window12_384_22k.pkl 27 | 28 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth 29 | python tools/convert-pretrained-swin-model-to-d2.py swin_large_patch4_window12_384_22k.pth swin_large_patch4_window12_384_22k.pkl 30 | ``` 31 | 32 | 33 | * `analyze_model.py` 34 | 35 | Tool to analyze model parameters and flops. 36 | 37 | Usage for semantic segmentation (ADE20K only, use with caution!): 38 | 39 | ``` 40 | python tools/analyze_model.py --num-inputs 1 --tasks flop --use-fixed-input-size --config-file CONFIG_FILE 41 | ``` 42 | 43 | Note that, for semantic segmentation (ADE20K only), we use a dummy image with fixed size that equals to `cfg.INPUT.CROP.SIZE[0] x cfg.INPUT.CROP.SIZE[0]`. 44 | Please do not use `--use-fixed-input-size` for calculating FLOPs on other datasets like Cityscapes! 45 | 46 | Usage for panoptic and instance segmentation: 47 | 48 | ``` 49 | python tools/analyze_model.py --num-inputs 100 --tasks flop --config-file CONFIG_FILE 50 | ``` 51 | 52 | Note that, for panoptic and instance segmentation, we compute the average flops over 100 real validation images. 53 | -------------------------------------------------------------------------------- /tools/analyze_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detectron2/blob/main/tools/analyze_model.py 4 | 5 | import logging 6 | import numpy as np 7 | from collections import Counter 8 | import tqdm 9 | from fvcore.nn import flop_count_table # can also try flop_count_str 10 | 11 | from detectron2.checkpoint import DetectionCheckpointer 12 | from detectron2.config import CfgNode, LazyConfig, get_cfg, instantiate 13 | from detectron2.data import build_detection_test_loader 14 | from detectron2.engine import default_argument_parser 15 | from detectron2.modeling import build_model 16 | from detectron2.projects.deeplab import add_deeplab_config 17 | from detectron2.utils.analysis import ( 18 | FlopCountAnalysis, 19 | activation_count_operators, 20 | parameter_count_table, 21 | ) 22 | from detectron2.utils.logger import setup_logger 23 | 24 | # fmt: off 25 | import os 26 | import sys 27 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 28 | # fmt: on 29 | 30 | from maskdino import add_maskdino_config # config adder from maskdino/config.py (this repo does not ship mask2former) 31 | 32 | logger = logging.getLogger("detectron2") 33 | 34 | 35 | def setup(args): 36 | if args.config_file.endswith(".yaml"): 37 | cfg = get_cfg() 38 | add_deeplab_config(cfg) 39 | add_maskdino_config(cfg) 40 | cfg.merge_from_file(args.config_file) 41 | cfg.DATALOADER.NUM_WORKERS = 0 42 | cfg.merge_from_list(args.opts) 43 | cfg.freeze() 44 | else: 45 | cfg = LazyConfig.load(args.config_file) 46 | cfg = LazyConfig.apply_overrides(cfg, args.opts) 47 | setup_logger(name="fvcore") 48 | setup_logger() 49 | return cfg 50 | 51 | 52 | def do_flop(cfg): 53 | if isinstance(cfg, CfgNode): 54 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) 55 | model = build_model(cfg) 56 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 57 | else: 58 | data_loader = instantiate(cfg.dataloader.test) 59 | model = instantiate(cfg.model) 60 | model.to(cfg.train.device) 61 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint) 62 | model.eval() 63 | 64 | counts = Counter() 65 | total_flops = [] 66 | for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa 67 | if args.use_fixed_input_size and isinstance(cfg, CfgNode): 68 | import torch 69 | crop_size = cfg.INPUT.CROP.SIZE[0] 70 | data[0]["image"] = torch.zeros((3, crop_size, crop_size)) 71 | flops = FlopCountAnalysis(model, data) 72 | if idx > 0: 73 | flops.unsupported_ops_warnings(False).uncalled_modules_warnings(False) 74 | counts += flops.by_operator() 75 | total_flops.append(flops.total()) 76 | 77 | logger.info("Flops table computed from only one input sample:\n" + flop_count_table(flops)) 78 | logger.info( 79 | "Average GFlops for each type of operators:\n" 80 | + str([(k, v / (idx + 1) / 1e9) for k, v in counts.items()]) 81 | ) 82 | logger.info( 83 | "Total GFlops: {:.1f}±{:.1f}".format(np.mean(total_flops) / 1e9, np.std(total_flops) / 1e9) 84 | ) 85 | 86 | 87 | def do_activation(cfg): 88 | if isinstance(cfg, CfgNode): 89 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) 90 | model = build_model(cfg) 91 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 92 | else: 93 | data_loader = instantiate(cfg.dataloader.test) 94 | model = instantiate(cfg.model) 95 | model.to(cfg.train.device) 96 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint) 97 | model.eval() 98 | 99 | counts = Counter() 100 | total_activations = [] 101 | for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa 102 | count = activation_count_operators(model, data) 103 | counts += count
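# per-operator activation counts are accumulated across inputs; the per-image totals collected below are averaged and reported after the loop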
104 | total_activations.append(sum(count.values())) 105 | logger.info( 106 | "(Million) Activations for Each Type of Operators:\n" 107 | + str([(k, v / (idx + 1)) for k, v in counts.items()]) 108 | ) 109 | logger.info( 110 | "Total (Million) Activations: {}±{}".format( 111 | np.mean(total_activations), np.std(total_activations) 112 | ) 113 | ) 114 | 115 | 116 | def do_parameter(cfg): 117 | if isinstance(cfg, CfgNode): 118 | model = build_model(cfg) 119 | else: 120 | model = instantiate(cfg.model) 121 | logger.info("Parameter Count:\n" + parameter_count_table(model, max_depth=5)) 122 | 123 | 124 | def do_structure(cfg): 125 | if isinstance(cfg, CfgNode): 126 | model = build_model(cfg) 127 | else: 128 | model = instantiate(cfg.model) 129 | logger.info("Model Structure:\n" + str(model)) 130 | 131 | 132 | if __name__ == "__main__": 133 | parser = default_argument_parser( 134 | epilog=""" 135 | Examples: 136 | To show parameters of a model: 137 | $ ./analyze_model.py --tasks parameter \\ 138 | --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml 139 | Flops and activations are data-dependent, therefore inputs and model weights 140 | are needed to count them: 141 | $ ./analyze_model.py --num-inputs 100 --tasks flop \\ 142 | --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \\ 143 | MODEL.WEIGHTS /path/to/model.pkl 144 | """ 145 | ) 146 | parser.add_argument( 147 | "--tasks", 148 | choices=["flop", "activation", "parameter", "structure"], 149 | required=True, 150 | nargs="+", 151 | ) 152 | parser.add_argument( 153 | "-n", 154 | "--num-inputs", 155 | default=100, 156 | type=int, 157 | help="number of inputs used to compute statistics for flops/activations, " 158 | "both are data dependent.", 159 | ) 160 | parser.add_argument( 161 | "--use-fixed-input-size", 162 | action="store_true", 163 | help="use fixed input size when calculating flops", 164 | ) 165 | args = parser.parse_args() 166 | assert not args.eval_only 167 | assert args.num_gpus == 1 168 | 169 | cfg = setup(args) 170 | 171 | for task in args.tasks: 172 | { 173 | "flop": do_flop, 174 | "activation": do_activation, 175 | "parameter": do_parameter, 176 | "structure": do_structure, 177 | }[task](cfg) 178 | -------------------------------------------------------------------------------- /tools/convert-pretrained-swin-model-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download pretrained swin model: 12 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 13 | # run the conversion 14 | ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 15 | # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl" 18 | INPUT: 19 | FORMAT: "RGB" 20 | """ 21 | 22 | if __name__ == "__main__": 23 | input = sys.argv[1] 24 | 25 | obj = torch.load(input, map_location="cpu")["model"] 26 | 27 | res = {"model": obj, "__author__": "third_party", "matching_heuristics": True} 28 | 29 | with open(sys.argv[2], "wb") as f: 30 | pkl.dump(res, f) 31 | -------------------------------------------------------------------------------- /tools/convert-torchvision-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download one of the ResNet{18,34,50,101,152} models from torchvision: 12 | wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth 13 | # run the conversion 14 | ./convert-torchvision-to-d2.py r50.pth r50.pkl 15 | # Then, use r50.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/r50.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | RESNETS: 21 | DEPTH: 50 22 | STRIDE_IN_1X1: False 23 | INPUT: 24 | FORMAT: "RGB" 25 | """ 26 | 27 | if __name__ == "__main__": 28 | input = sys.argv[1] 29 | 30 | obj = torch.load(input, map_location="cpu") 31 | 32 | newmodel = {} 33 | for k in list(obj.keys()): 34 | old_k = k 35 | if "layer" not in k: 36 | k = "stem." + k 37 | for t in [1, 2, 3, 4]: 38 | k = k.replace("layer{}".format(t), "res{}".format(t + 1)) 39 | for t in [1, 2, 3]: 40 | k = k.replace("bn{}".format(t), "conv{}.norm".format(t)) 41 | k = k.replace("downsample.0", "shortcut") 42 | k = k.replace("downsample.1", "shortcut.norm") 43 | print(old_k, "->", k) 44 | newmodel[k] = obj.pop(old_k).detach().numpy() 45 | 46 | res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True} 47 | 48 | with open(sys.argv[2], "wb") as f: 49 | pkl.dump(res, f) 50 | if obj: 51 | print("Unconverted keys:", obj.keys()) 52 | -------------------------------------------------------------------------------- /tools/evaluate_coco_boundary_ap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 3 | # Modified by Bowen Cheng from: https://github.com/bowenc0221/boundary-iou-api/blob/master/tools/coco_instance_evaluation.py 4 | 5 | """ 6 | Evaluation for COCO val2017: 7 | python ./tools/evaluate_coco_boundary_ap.py \ 8 | --gt-json-file COCO_GT_JSON \ 9 | --dt-json-file COCO_DT_JSON 10 | """ 11 | import argparse 12 | import json 13 | 14 | from boundary_iou.coco_instance_api.coco import COCO 15 | from boundary_iou.coco_instance_api.cocoeval import COCOeval 16 | 17 | 18 | def main(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--gt-json-file", default="") 21 | parser.add_argument("--dt-json-file", default="") 22 | parser.add_argument("--iou-type", default="boundary") 23 | parser.add_argument("--dilation-ratio", default="0.020", type=float) 24 | args = parser.parse_args() 25 | print(args) 26 | 27 | annFile = args.gt_json_file 28 | resFile = args.dt_json_file 29 | dilation_ratio = args.dilation_ratio 30 | if args.iou_type == "boundary": 31 | get_boundary = True 32 | else: 33 | get_boundary = False 34 | cocoGt = COCO(annFile, get_boundary=get_boundary, dilation_ratio=dilation_ratio) 35 | 36 | # remove box predictions 37 | resFile = json.load(open(resFile)) 38 | for c in resFile: 39 | c.pop("bbox", None) 40 | 41 | cocoDt = cocoGt.loadRes(resFile) 42 | cocoEval = COCOeval(cocoGt, cocoDt, iouType=args.iou_type, dilation_ratio=dilation_ratio) 43 | cocoEval.evaluate() 44 | cocoEval.accumulate() 45 | cocoEval.summarize() 46 | 47 | 48 | if __name__ == '__main__': 49 | main() 50 | -------------------------------------------------------------------------------- /tools/evaluate_pq_for_semantic_segmentation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | import argparse 5 | import json 6 | import os 7 | from collections import defaultdict 8 | from tqdm import tqdm 9 | 10 | import numpy as np 11 | import torch 12 | 13 | from detectron2.data import MetadataCatalog 14 | from detectron2.data.detection_utils import read_image 15 | from detectron2.utils.file_io import PathManager 16 | from pycocotools import mask as maskUtils 17 | 18 | from panopticapi.evaluation import PQStat 19 | 20 | 21 | def default_argument_parser(): 22 | """ 23 | Creates a parser with the arguments used by this evaluation script. 24 | Returns: 25 | argparse.ArgumentParser: 26 | """ 27 | parser = argparse.ArgumentParser(description="Evaluate PQ metric for semantic segmentation.") 28 | # NOTE: Cityscapes is not supported yet; you need to convert 29 | # Cityscapes predictions to the Detectron2 prediction format first. 
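# Example invocation (the JSON path is illustrative -- point --json-file at the semantic-segmentation predictions saved by your evaluation run): python tools/evaluate_pq_for_semantic_segmentation.py --dataset-name ade20k_sem_seg_val --json-file output/inference/sem_seg_predictions.json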
30 | parser.add_argument( 31 | "--dataset-name", 32 | default="ade20k_sem_seg_val", 33 | choices=["ade20k_sem_seg_val", "coco_2017_test_stuff_10k_sem_seg", "ade20k_full_sem_seg_val"], 34 | help="dataset name you want to evaluate") 35 | parser.add_argument("--json-file", default="", help="path to detection json file") 36 | 37 | return parser 38 | 39 | 40 | # Modified from the official panoptic api: https://github.com/cocodataset/panopticapi/blob/master/panopticapi/evaluation.py 41 | def pq_compute_single_image(segm_gt, segm_dt, categories, ignore_label): 42 | pq_stat = PQStat() 43 | VOID = ignore_label 44 | OFFSET = 256 * 256 * 256 45 | 46 | pan_gt = segm_gt 47 | pan_pred = segm_dt 48 | 49 | gt_ann = {'segments_info': []} 50 | labels, labels_cnt = np.unique(segm_gt, return_counts=True) 51 | for cat_id, cnt in zip(labels, labels_cnt): 52 | if cat_id == VOID: 53 | continue 54 | gt_ann['segments_info'].append( 55 | {"id": cat_id, "category_id": cat_id, "area": cnt, "iscrowd": 0} 56 | ) 57 | 58 | pred_ann = {'segments_info': []} 59 | for cat_id in np.unique(segm_dt): 60 | pred_ann['segments_info'].append({"id": cat_id, "category_id": cat_id}) 61 | 62 | gt_segms = {el['id']: el for el in gt_ann['segments_info']} 63 | pred_segms = {el['id']: el for el in pred_ann['segments_info']} 64 | 65 | # predicted segments area calculation + prediction sanity checks 66 | pred_labels_set = set(el['id'] for el in pred_ann['segments_info']) 67 | labels, labels_cnt = np.unique(pan_pred, return_counts=True) 68 | for label, label_cnt in zip(labels, labels_cnt): 69 | if label not in pred_segms: 70 | if label == VOID: 71 | continue 72 | raise KeyError('In the image with ID {} segment with ID {} is presented in PNG and not presented in JSON.'.format(image_id, label)) 73 | pred_segms[label]['area'] = label_cnt 74 | pred_labels_set.remove(label) 75 | if pred_segms[label]['category_id'] not in categories: 76 | raise KeyError('In the image with ID {} segment with ID {} has unknown category_id {}.'.format(image_id, label, pred_segms[label]['category_id'])) 77 | if len(pred_labels_set) != 0: 78 | raise KeyError('In the image with ID {} the following segment IDs {} are presented in JSON and not presented in PNG.'.format(image_id, list(pred_labels_set))) 79 | 80 | # confusion matrix calculation 81 | pan_gt_pred = pan_gt.astype(np.uint64) * OFFSET + pan_pred.astype(np.uint64) 82 | gt_pred_map = {} 83 | labels, labels_cnt = np.unique(pan_gt_pred, return_counts=True) 84 | for label, intersection in zip(labels, labels_cnt): 85 | gt_id = label // OFFSET 86 | pred_id = label % OFFSET 87 | gt_pred_map[(gt_id, pred_id)] = intersection 88 | 89 | # count all matched pairs 90 | gt_matched = set() 91 | pred_matched = set() 92 | for label_tuple, intersection in gt_pred_map.items(): 93 | gt_label, pred_label = label_tuple 94 | if gt_label not in gt_segms: 95 | continue 96 | if pred_label not in pred_segms: 97 | continue 98 | if gt_segms[gt_label]['iscrowd'] == 1: 99 | continue 100 | if gt_segms[gt_label]['category_id'] != pred_segms[pred_label]['category_id']: 101 | continue 102 | 103 | union = pred_segms[pred_label]['area'] + gt_segms[gt_label]['area'] - intersection - gt_pred_map.get((VOID, pred_label), 0) 104 | iou = intersection / union 105 | if iou > 0.5: 106 | pq_stat[gt_segms[gt_label]['category_id']].tp += 1 107 | pq_stat[gt_segms[gt_label]['category_id']].iou += iou 108 | gt_matched.add(gt_label) 109 | pred_matched.add(pred_label) 110 | 111 | # count false positives 112 | crowd_labels_dict = {} 113 | for gt_label, gt_info in 
gt_segms.items(): 114 | if gt_label in gt_matched: 115 | continue 116 | # crowd segments are ignored 117 | if gt_info['iscrowd'] == 1: 118 | crowd_labels_dict[gt_info['category_id']] = gt_label 119 | continue 120 | pq_stat[gt_info['category_id']].fn += 1 121 | 122 | # count false positives 123 | for pred_label, pred_info in pred_segms.items(): 124 | if pred_label in pred_matched: 125 | continue 126 | # intersection of the segment with VOID 127 | intersection = gt_pred_map.get((VOID, pred_label), 0) 128 | # plus intersection with corresponding CROWD region if it exists 129 | if pred_info['category_id'] in crowd_labels_dict: 130 | intersection += gt_pred_map.get((crowd_labels_dict[pred_info['category_id']], pred_label), 0) 131 | # predicted segment is ignored if more than half of the segment correspond to VOID and CROWD regions 132 | if intersection / pred_info['area'] > 0.5: 133 | continue 134 | pq_stat[pred_info['category_id']].fp += 1 135 | 136 | return pq_stat 137 | 138 | 139 | def main(): 140 | parser = default_argument_parser() 141 | args = parser.parse_args() 142 | 143 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 144 | json_file = args.json_file 145 | 146 | with open(json_file) as f: 147 | predictions = json.load(f) 148 | 149 | imgToAnns = defaultdict(list) 150 | for pred in predictions: 151 | image_id = os.path.basename(pred["file_name"]).split(".")[0] 152 | imgToAnns[image_id].append( 153 | {"category_id" : pred["category_id"], "segmentation" : pred["segmentation"]} 154 | ) 155 | 156 | image_ids = list(imgToAnns.keys()) 157 | 158 | meta = MetadataCatalog.get(args.dataset_name) 159 | class_names = meta.stuff_classes 160 | num_classes = len(meta.stuff_classes) 161 | ignore_label = meta.ignore_label 162 | conf_matrix = np.zeros((num_classes + 1, num_classes + 1), dtype=np.int64) 163 | 164 | categories = {} 165 | for i in range(num_classes): 166 | categories[i] = {"id": i, "name": class_names[i], "isthing": 0} 167 | 168 | pq_stat = PQStat() 169 | 170 | for image_id in tqdm(image_ids): 171 | if args.dataset_name == "ade20k_sem_seg_val": 172 | gt_dir = os.path.join(_root, "ADEChallengeData2016", "annotations_detectron2", "validation") 173 | segm_gt = read_image(os.path.join(gt_dir, image_id + ".png")).copy().astype(np.int64) 174 | elif args.dataset_name == "coco_2017_test_stuff_10k_sem_seg": 175 | gt_dir = os.path.join(_root, "coco", "coco_stuff_10k", "annotations_detectron2", "test") 176 | segm_gt = read_image(os.path.join(gt_dir, image_id + ".png")).copy().astype(np.int64) 177 | elif args.dataset_name == "ade20k_full_sem_seg_val": 178 | gt_dir = os.path.join(_root, "ADE20K_2021_17_01", "annotations_detectron2", "validation") 179 | segm_gt = read_image(os.path.join(gt_dir, image_id + ".tif")).copy().astype(np.int64) 180 | else: 181 | raise ValueError(f"Unsupported dataset {args.dataset_name}") 182 | 183 | # get predictions 184 | segm_dt = np.zeros_like(segm_gt) 185 | anns = imgToAnns[image_id] 186 | for ann in anns: 187 | # map back category_id 188 | if hasattr(meta, "stuff_dataset_id_to_contiguous_id"): 189 | if ann["category_id"] in meta.stuff_dataset_id_to_contiguous_id: 190 | category_id = meta.stuff_dataset_id_to_contiguous_id[ann["category_id"]] 191 | else: 192 | category_id = ann["category_id"] 193 | mask = maskUtils.decode(ann["segmentation"]) 194 | segm_dt[mask > 0] = category_id 195 | 196 | # miou 197 | gt = segm_gt.copy() 198 | pred = segm_dt.copy() 199 | gt[gt == ignore_label] = num_classes 200 | conf_matrix += np.bincount( 201 | (num_classes + 1) * pred.reshape(-1) 
+ gt.reshape(-1), 202 | minlength=conf_matrix.size, 203 | ).reshape(conf_matrix.shape) 204 | 205 | # pq 206 | pq_stat_single = pq_compute_single_image(segm_gt, segm_dt, categories, meta.ignore_label) 207 | pq_stat += pq_stat_single 208 | 209 | metrics = [("All", None), ("Stuff", False)] 210 | results = {} 211 | for name, isthing in metrics: 212 | results[name], per_class_results = pq_stat.pq_average(categories, isthing=isthing) 213 | if name == 'All': 214 | results['per_class'] = per_class_results 215 | print("{:10s}| {:>5s} {:>5s} {:>5s} {:>5s}".format("", "PQ", "SQ", "RQ", "N")) 216 | print("-" * (10 + 7 * 4)) 217 | 218 | for name, _isthing in metrics: 219 | print("{:10s}| {:5.1f} {:5.1f} {:5.1f} {:5d}".format( 220 | name, 221 | 100 * results[name]['pq'], 222 | 100 * results[name]['sq'], 223 | 100 * results[name]['rq'], 224 | results[name]['n']) 225 | ) 226 | 227 | # calculate miou 228 | acc = np.full(num_classes, np.nan, dtype=np.float64) 229 | iou = np.full(num_classes, np.nan, dtype=np.float64) 230 | tp = conf_matrix.diagonal()[:-1].astype(np.float64) 231 | pos_gt = np.sum(conf_matrix[:-1, :-1], axis=0).astype(np.float64) 232 | pos_pred = np.sum(conf_matrix[:-1, :-1], axis=1).astype(np.float64) 233 | acc_valid = pos_gt > 0 234 | acc[acc_valid] = tp[acc_valid] / pos_gt[acc_valid] 235 | iou_valid = (pos_gt + pos_pred) > 0 236 | union = pos_gt + pos_pred - tp 237 | iou[acc_valid] = tp[acc_valid] / union[acc_valid] 238 | miou = np.sum(iou[acc_valid]) / np.sum(iou_valid) 239 | 240 | print("") 241 | print(f"mIoU: {miou}") 242 | 243 | 244 | if __name__ == '__main__': 245 | main() 246 | --------------------------------------------------------------------------------