├── .DS_Store ├── .gitignore ├── INSTALL.md ├── LICENSE ├── README.md ├── configs ├── ade20k │ └── semantic-segmentation │ │ ├── Base-ADE20K-SemanticSegmentation.yaml │ │ └── maskdino_R50_bs16_160k_steplr.yaml ├── cityscapes │ └── semantic-segmentation │ │ ├── Base-Cityscapes-SemanticSegmentation.yaml │ │ └── maskdino_R50_bs16_90k_steplr.yaml └── coco │ ├── instance-segmentation │ ├── Base-COCO-InstanceSegmentation.yaml │ ├── maskdino_R50_bs16_50ep_3s.yaml │ ├── maskdino_R50_bs16_50ep_3s_dowsample1_2048.yaml │ ├── maskdino_R50_bs16_50ep_3s_dowsample1_2048_bitmask.yaml │ ├── maskdino_R50_bs16_50ep_3s_dowsample2.yaml │ ├── maskdino_R50_bs16_50ep_4s_dowsample1_2048.yaml │ └── swin │ │ ├── maskdino_R50_bs16_50ep_4s_dowsample1_1024.yaml │ │ ├── maskdino_R50_bs16_50ep_4s_dowsample1_2048.yaml │ │ └── maskdino_R50_bs16_50ep_4s_dowsample1_2048_no_maskEnhance.yaml │ └── panoptic-segmentation │ ├── Base-COCO-PanopticSegmentation.yaml │ ├── maskdino_R50_bs16_50ep_3s_dowsample1_2048.yaml │ └── swin │ ├── maskdino_R50_bs16_50ep_4s_dowsample1_2048.yaml │ ├── maskdino_R50_bs16_50ep_4s_dowsample1_large_eval.yaml │ └── maskdino_R50_bs16_50ep_4s_dowsample1_maskEnhance_2048.yaml ├── datasets ├── README.md ├── ade20k_instance_catid_mapping.txt ├── ade20k_instance_imgCatIds.json ├── prepare_ade20k_ins_seg.py ├── prepare_ade20k_pan_seg.py ├── prepare_ade20k_sem_seg.py └── prepare_coco_semantic_annos_from_panoptic_annos.py ├── demo ├── README.md ├── demo.py └── predictor.py ├── figures ├── dinosaur.png ├── framework.jpg ├── instance.png ├── panoptic.png ├── semantic.png └── sota.png ├── maskdino ├── .DS_Store ├── __init__.py ├── config.py ├── data │ ├── __init__.py │ ├── dataset_mappers │ │ ├── __init__.py │ │ ├── coco_instance_new_baseline_dataset_mapper.py │ │ ├── coco_panoptic_new_baseline_dataset_mapper.py │ │ ├── detr_dataset_mapper.py │ │ └── mask_former_semantic_dataset_mapper.py │ └── datasets │ │ ├── __init__.py │ │ ├── register_ade20k_full.py │ │ ├── register_ade20k_instance.py │ │ ├── register_ade20k_panoptic.py │ │ ├── register_coco_panoptic_annos_semseg.py │ │ ├── register_coco_stuff_10k.py │ │ ├── register_mapillary_vistas.py │ │ └── register_mapillary_vistas_panoptic.py ├── evaluation │ ├── __init__.py │ └── instance_evaluation.py ├── maskdino.py ├── modeling │ ├── __init__.py │ ├── backbone │ │ ├── __init__.py │ │ ├── focal.py │ │ └── swin.py │ ├── criterion.py │ ├── matcher.py │ ├── meta_arch │ │ ├── __init__.py │ │ └── maskdino_head.py │ ├── pixel_decoder │ │ ├── __init__.py │ │ ├── maskdino_encoder.py │ │ ├── ops │ │ │ ├── functions │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn_func.py │ │ │ ├── make.sh │ │ │ ├── modules │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn.py │ │ │ ├── setup.py │ │ │ ├── src │ │ │ │ ├── cpu │ │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ │ └── ms_deform_attn_cpu.h │ │ │ │ ├── cuda │ │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ │ ├── ms_deform_attn.h │ │ │ │ └── vision.cpp │ │ │ └── test.py │ │ └── position_encoding.py │ └── transformer_decoder │ │ ├── __init__.py │ │ ├── dino_decoder.py │ │ └── maskdino_decoder.py ├── test_time_augmentation.py └── utils │ ├── __init__.py │ ├── box_ops.py │ ├── misc.py │ └── utils.py ├── requirements.txt ├── tools ├── README.md ├── analyze_model.py ├── convert-pretrained-swin-model-to-d2.py ├── convert-torchvision-to-d2.py ├── evaluate_coco_boundary_ap.py └── evaluate_pq_for_semantic_segmentation.py └── train_net.py /.DS_Store: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-Research/MaskDINO/3831d8514a3728535ace8d4ecc7d28044c42dd14/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # output dir 2 | output 3 | instant_test_output 4 | inference_test_output 5 | 6 | 7 | *.png 8 | *.json 9 | *.diff 10 | *.jpg 11 | !/projects/DensePose/doc/images/*.jpg 12 | 13 | # compilation and distribution 14 | __pycache__ 15 | _ext 16 | *.pyc 17 | *.pyd 18 | *.so 19 | *.dll 20 | *.egg-info/ 21 | build/ 22 | dist/ 23 | wheels/ 24 | 25 | # pytorch/python/numpy formats 26 | *.pth 27 | *.pkl 28 | *.npy 29 | *.ts 30 | model_ts*.txt 31 | 32 | # ipython/jupyter notebooks 33 | *.ipynb 34 | **/.ipynb_checkpoints/ 35 | 36 | # Editor temporaries 37 | *.swn 38 | *.swo 39 | *.swp 40 | *~ 41 | 42 | # editor settings 43 | .idea 44 | .vscode 45 | _darcs 46 | 47 | # project dirs 48 | /detectron2/model_zoo/configs 49 | /datasets/* 50 | !/datasets/*.* 51 | /projects/*/datasets 52 | /models 53 | /snippet 54 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | ### Requirements 4 | - Linux with Python ≥ 3.6 5 | - PyTorch ≥ 1.9 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation. 6 | Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note: please check that 7 | the PyTorch version matches the one required by Detectron2. 8 | - Detectron2: follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html). 9 | - OpenCV is optional but needed for the demo and visualization 10 | - `pip install -r requirements.txt` 11 | 12 | ### CUDA kernel for MSDeformAttn 13 | After preparing the required environment, run the following commands to compile the CUDA kernel for MSDeformAttn: 14 | 15 | `CUDA_HOME` must be defined and point to the directory of the installed CUDA toolkit. 16 | 17 | ```bash 18 | cd maskdino/modeling/pixel_decoder/ops 19 | sh make.sh 20 | ``` 21 | 22 | #### Building on another system 23 | To build on a system that does not have a GPU device but provides the drivers: 24 | ```bash 25 | TORCH_CUDA_ARCH_LIST='8.0' FORCE_CUDA=1 python setup.py build install 26 | ``` 27 | 28 | ### Example conda environment setup 29 | ```bash 30 | conda create --name maskdino python=3.8 -y 31 | conda activate maskdino 32 | conda install pytorch==1.9.0 torchvision==0.10.0 cudatoolkit=11.1 -c pytorch -c nvidia 33 | pip install -U opencv-python 34 | 35 | # under your working directory 36 | git clone git@github.com:facebookresearch/detectron2.git 37 | cd detectron2 38 | pip install -e . 39 | pip install git+https://github.com/cocodataset/panopticapi.git 40 | pip install git+https://github.com/mcordts/cityscapesScripts.git 41 | 42 | cd ..
43 | git clone git@github.com:facebookresearch/MaskDINO.git 44 | cd MaskDINO 45 | pip install -r requirements.txt 46 | cd maskdino/modeling/pixel_decoder/ops 47 | sh make.sh 48 | ``` 49 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/Base-ADE20K-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_sem_seg_train",) 18 | TEST: ("ade20k_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 512 40 | MAX_SIZE_TRAIN: 2048 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 512) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 512 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [256, 384, 512, 640, 768, 896] 56 | MAX_SIZE: 3584 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/maskdino_R50_bs16_160k_steplr.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskDINOHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 14 | DIM_FEEDFORWARD: 1024 15 | NUM_FEATURE_LEVELS: 3 16 | TOTAL_NUM_FEATURE_LEVELS: 3 17 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 18 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 19 | COMMON_STRIDE: 4 20 | TRANSFORMER_ENC_LAYERS: 6 21 | MaskDINO: 22 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 23 | DEEP_SUPERVISION: True 24 | NO_OBJECT_WEIGHT: 0.1 25 | CLASS_WEIGHT: 4.0 26 | MASK_WEIGHT: 5.0 27 | DICE_WEIGHT: 5.0 28 | HIDDEN_DIM: 256 29 | NUM_OBJECT_QUERIES: 100 30 | NHEADS: 8 31 | DROPOUT: 0.0 32 | DIM_FEEDFORWARD: 2048 33 | ENC_LAYERS: 0 34 | PRE_NORM: False 35 | ENFORCE_INPUT_PROJ: False 36 | SIZE_DIVISIBILITY: 32 37 | DEC_LAYERS: 9 # 9 decoder layers, add one for the loss on learnable query 38 | TRAIN_NUM_POINTS: 12544 39 | OVERSAMPLE_RATIO: 3.0 40 | IMPORTANCE_SAMPLE_RATIO: 0.75 41 | TWO_STAGE: False 42 | DN: "seg" 43 | DN_NUM: 100 44 | INITIALIZE_BOX_TYPE: "no" 45 | 
SEMANTIC_CE_LOSS: True 46 | TEST: 47 | SEMANTIC_ON: True 48 | INSTANCE_ON: False 49 | PANOPTIC_ON: False 50 | OVERLAP_THRESHOLD: 0.8 51 | OBJECT_MASK_THRESHOLD: 0.8 52 | SOLVER: 53 | AMP: 54 | ENABLED: False 55 | BACKBONE_MULTIPLIER: 0.1 56 | BASE_LR: 0.0001 57 | BIAS_LR_FACTOR: 1.0 58 | CHECKPOINT_PERIOD: 5000 59 | 60 | IMS_PER_BATCH: 16 61 | LR_SCHEDULER_NAME: WarmupMultiStepLR 62 | MAX_ITER: 160000 63 | 64 | STEPS: (135000,150000) 65 | WARMUP_FACTOR: 1.0 66 | WARMUP_ITERS: 10 67 | WARMUP_METHOD: linear 68 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/Base-Cityscapes-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_sem_seg_train",) 18 | TEST: ("cityscapes_fine_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/maskdino_R50_bs16_90k_steplr.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskDINOHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 14 | DIM_FEEDFORWARD: 1024 15 | NUM_FEATURE_LEVELS: 3 16 | TOTAL_NUM_FEATURE_LEVELS: 3 17 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 18 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 19 | COMMON_STRIDE: 4 20 | TRANSFORMER_ENC_LAYERS: 6 21 | MaskDINO: 22 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 23 | DEEP_SUPERVISION: True 24 | NO_OBJECT_WEIGHT: 0.1 25 | CLASS_WEIGHT: 4.0 26 | MASK_WEIGHT: 5.0 27 | DICE_WEIGHT: 5.0 28 | HIDDEN_DIM: 256 29 | NUM_OBJECT_QUERIES: 100 30 | NHEADS: 8 31 | DROPOUT: 0.0 32 | 
DIM_FEEDFORWARD: 2048 33 | ENC_LAYERS: 0 34 | PRE_NORM: False 35 | ENFORCE_INPUT_PROJ: False 36 | SIZE_DIVISIBILITY: 32 37 | DEC_LAYERS: 9 # 9 decoder layers, add one for the loss on learnable query 38 | TRAIN_NUM_POINTS: 12544 39 | OVERSAMPLE_RATIO: 3.0 40 | IMPORTANCE_SAMPLE_RATIO: 0.75 41 | TWO_STAGE: False 42 | DN: "seg" 43 | DN_NUM: 100 44 | INITIALIZE_BOX_TYPE: "no" 45 | SEMANTIC_CE_LOSS: True 46 | TEST: 47 | SEMANTIC_ON: True 48 | INSTANCE_ON: False 49 | PANOPTIC_ON: False 50 | OVERLAP_THRESHOLD: 0.8 51 | OBJECT_MASK_THRESHOLD: 0.8 52 | SOLVER: 53 | AMP: 54 | ENABLED: False 55 | BACKBONE_MULTIPLIER: 0.1 56 | BASE_LR: 0.0001 57 | BIAS_LR_FACTOR: 1.0 58 | CHECKPOINT_PERIOD: 5000 59 | 60 | IMS_PER_BATCH: 16 61 | LR_SCHEDULER_NAME: WarmupMultiStepLR 62 | MAX_ITER: 90000 63 | 64 | STEPS: (80000,87000) 65 | WARMUP_FACTOR: 1.0 66 | WARMUP_ITERS: 10 67 | WARMUP_METHOD: linear 68 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/Base-COCO-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train",) 18 | TEST: ("coco_2017_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_instance_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | # EVAL_FLAG: 1 45 | DATALOADER: 46 | FILTER_EMPTY_ANNOTATIONS: True 47 | NUM_WORKERS: 4 48 | VERSION: 2 49 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/maskdino_R50_bs16_50ep_3s.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskDINOHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 80 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 14 | DIM_FEEDFORWARD: 1024 15 | NUM_FEATURE_LEVELS: 3 16 | TOTAL_NUM_FEATURE_LEVELS: 3 17 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 18 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 19 | COMMON_STRIDE: 4 20 | TRANSFORMER_ENC_LAYERS: 6 21 | MaskDINO: 22 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 23 | DEEP_SUPERVISION: True 24 | NO_OBJECT_WEIGHT: 0.1 25 | CLASS_WEIGHT: 4.0 26 | MASK_WEIGHT: 5.0 27 | DICE_WEIGHT: 5.0 28 | BOX_WEIGHT: 5.0 29 | GIOU_WEIGHT: 2.0 30 | HIDDEN_DIM: 256 31 | NUM_OBJECT_QUERIES: 300 32 | NHEADS: 8 33 | DROPOUT: 0.0 34 | DIM_FEEDFORWARD: 2048 35 | ENC_LAYERS: 0 36 | PRE_NORM: False 37 | ENFORCE_INPUT_PROJ: False 38 | 
SIZE_DIVISIBILITY: 32 39 | DEC_LAYERS: 9 # 9+1, 9 decoder layers, add one for the loss on learnable query 40 | TRAIN_NUM_POINTS: 12544 41 | OVERSAMPLE_RATIO: 3.0 42 | IMPORTANCE_SAMPLE_RATIO: 0.75 43 | INITIAL_PRED: True 44 | TWO_STAGE: True 45 | DN: "seg" 46 | DN_NUM: 100 47 | INITIALIZE_BOX_TYPE: "bitmask" 48 | TEST: 49 | SEMANTIC_ON: False 50 | INSTANCE_ON: True 51 | PANOPTIC_ON: False 52 | OVERLAP_THRESHOLD: 0.8 53 | OBJECT_MASK_THRESHOLD: 0.25 54 | 55 | SOLVER: 56 | AMP: 57 | ENABLED: True 58 | TEST: 59 | EVAL_PERIOD: 5000 60 | # EVAL_FLAG: 1 -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/maskdino_R50_bs16_50ep_3s_dowsample1_2048.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskDINOHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 80 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 14 | DIM_FEEDFORWARD: 2048 15 | NUM_FEATURE_LEVELS: 3 16 | TOTAL_NUM_FEATURE_LEVELS: 4 17 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 18 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 19 | COMMON_STRIDE: 4 20 | TRANSFORMER_ENC_LAYERS: 6 21 | FEATURE_ORDER: "low2high" 22 | MaskDINO: 23 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 24 | DEEP_SUPERVISION: True 25 | NO_OBJECT_WEIGHT: 0.1 26 | CLASS_WEIGHT: 4.0 27 | MASK_WEIGHT: 5.0 28 | DICE_WEIGHT: 5.0 29 | BOX_WEIGHT: 5.0 30 | GIOU_WEIGHT: 2.0 31 | HIDDEN_DIM: 256 32 | NUM_OBJECT_QUERIES: 300 33 | NHEADS: 8 34 | DROPOUT: 0.0 35 | DIM_FEEDFORWARD: 2048 36 | ENC_LAYERS: 0 37 | PRE_NORM: False 38 | ENFORCE_INPUT_PROJ: False 39 | SIZE_DIVISIBILITY: 32 40 | DEC_LAYERS: 9 # 9+1, 9 decoder layers, add one for the loss on learnable query 41 | TRAIN_NUM_POINTS: 12544 42 | OVERSAMPLE_RATIO: 3.0 43 | IMPORTANCE_SAMPLE_RATIO: 0.75 44 | EVAL_FLAG: 1 45 | INITIAL_PRED: True 46 | TWO_STAGE: True 47 | DN: "seg" 48 | DN_NUM: 100 49 | INITIALIZE_BOX_TYPE: 'mask2box' 50 | TEST: 51 | SEMANTIC_ON: False 52 | INSTANCE_ON: True 53 | PANOPTIC_ON: False 54 | OVERLAP_THRESHOLD: 0.8 55 | OBJECT_MASK_THRESHOLD: 0.25 56 | 57 | SOLVER: 58 | AMP: 59 | ENABLED: True 60 | TEST: 61 | EVAL_PERIOD: 5000 62 | # EVAL_FLAG: 1 -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/maskdino_R50_bs16_50ep_3s_dowsample1_2048_bitmask.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskDINOHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 80 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 14 | DIM_FEEDFORWARD: 2048 15 | NUM_FEATURE_LEVELS: 3 16 | TOTAL_NUM_FEATURE_LEVELS: 4 17 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 18 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 19 | COMMON_STRIDE: 4 20 | TRANSFORMER_ENC_LAYERS: 6 21 | FEATURE_ORDER: "low2high" 22 | MaskDINO: 23 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 24 | DEEP_SUPERVISION: True 25 | NO_OBJECT_WEIGHT: 0.1 26 | CLASS_WEIGHT: 4.0 27 | MASK_WEIGHT: 5.0 28 | DICE_WEIGHT: 5.0 29 | BOX_WEIGHT: 5.0 30 | GIOU_WEIGHT: 2.0 31 | HIDDEN_DIM: 256 32 | NUM_OBJECT_QUERIES: 300 33 | 
NHEADS: 8 34 | DROPOUT: 0.0 35 | DIM_FEEDFORWARD: 2048 36 | ENC_LAYERS: 0 37 | PRE_NORM: False 38 | ENFORCE_INPUT_PROJ: False 39 | SIZE_DIVISIBILITY: 32 40 | DEC_LAYERS: 9 # 9+1, 9 decoder layers, add one for the loss on learnable query 41 | TRAIN_NUM_POINTS: 12544 42 | OVERSAMPLE_RATIO: 3.0 43 | IMPORTANCE_SAMPLE_RATIO: 0.75 44 | EVAL_FLAG: 1 45 | INITIAL_PRED: True 46 | TWO_STAGE: True 47 | DN: "seg" 48 | DN_NUM: 100 49 | INITIALIZE_BOX_TYPE: 'bitmask' 50 | TEST: 51 | SEMANTIC_ON: False 52 | INSTANCE_ON: True 53 | PANOPTIC_ON: False 54 | OVERLAP_THRESHOLD: 0.8 55 | OBJECT_MASK_THRESHOLD: 0.25 56 | 57 | SOLVER: 58 | AMP: 59 | ENABLED: True 60 | TEST: 61 | EVAL_PERIOD: 5000 62 | # EVAL_FLAG: 1 63 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/maskdino_R50_bs16_50ep_3s_dowsample2.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskDINOHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 80 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 14 | DIM_FEEDFORWARD: 1024 15 | NUM_FEATURE_LEVELS: 3 16 | TOTAL_NUM_FEATURE_LEVELS: 5 17 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 18 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 19 | COMMON_STRIDE: 4 20 | TRANSFORMER_ENC_LAYERS: 6 21 | MaskDINO: 22 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 23 | DEEP_SUPERVISION: True 24 | NO_OBJECT_WEIGHT: 0.1 25 | CLASS_WEIGHT: 4.0 26 | MASK_WEIGHT: 5.0 27 | DICE_WEIGHT: 5.0 28 | BOX_WEIGHT: 5.0 29 | GIOU_WEIGHT: 2.0 30 | HIDDEN_DIM: 256 31 | NUM_OBJECT_QUERIES: 300 32 | NHEADS: 8 33 | DROPOUT: 0.0 34 | DIM_FEEDFORWARD: 2048 35 | ENC_LAYERS: 0 36 | PRE_NORM: False 37 | ENFORCE_INPUT_PROJ: False 38 | SIZE_DIVISIBILITY: 32 39 | DEC_LAYERS: 9 # 9+1, 9 decoder layers, add one for the loss on learnable query 40 | TRAIN_NUM_POINTS: 12544 41 | OVERSAMPLE_RATIO: 3.0 42 | IMPORTANCE_SAMPLE_RATIO: 0.75 43 | EVAL_FLAG: 1 44 | INITIAL_PRED: True 45 | TWO_STAGE: True 46 | DN: "seg" 47 | DN_NUM: 100 48 | INITIALIZE_BOX_TYPE: 'no' 49 | TEST: 50 | SEMANTIC_ON: False 51 | INSTANCE_ON: True 52 | PANOPTIC_ON: False 53 | OVERLAP_THRESHOLD: 0.8 54 | OBJECT_MASK_THRESHOLD: 0.25 55 | 56 | SOLVER: 57 | AMP: 58 | ENABLED: True 59 | TEST: 60 | EVAL_PERIOD: 5000 61 | # EVAL_FLAG: 1 -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/maskdino_R50_bs16_50ep_4s_dowsample1_2048.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskDINOHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 80 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 14 | DIM_FEEDFORWARD: 2048 15 | NUM_FEATURE_LEVELS: 3 16 | TOTAL_NUM_FEATURE_LEVELS: 4 17 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 18 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res2","res3", "res4", "res5"] 19 | COMMON_STRIDE: 4 20 | TRANSFORMER_ENC_LAYERS: 6 21 | FEATURE_ORDER: "low2high" 22 | MaskDINO: 23 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 24 | DEEP_SUPERVISION: True 25 | NO_OBJECT_WEIGHT: 0.1 26 | CLASS_WEIGHT: 4.0 27 | MASK_WEIGHT: 5.0 28 | 
DICE_WEIGHT: 5.0 29 | BOX_WEIGHT: 5.0 30 | GIOU_WEIGHT: 2.0 31 | HIDDEN_DIM: 256 32 | NUM_OBJECT_QUERIES: 300 33 | NHEADS: 8 34 | DROPOUT: 0.0 35 | DIM_FEEDFORWARD: 2048 36 | ENC_LAYERS: 0 37 | PRE_NORM: False 38 | ENFORCE_INPUT_PROJ: False 39 | SIZE_DIVISIBILITY: 32 40 | DEC_LAYERS: 9 # 9+1, 9 decoder layers, add one for the loss on learnable query 41 | TRAIN_NUM_POINTS: 12544 42 | OVERSAMPLE_RATIO: 3.0 43 | IMPORTANCE_SAMPLE_RATIO: 0.75 44 | EVAL_FLAG: 1 45 | INITIAL_PRED: True 46 | TWO_STAGE: True 47 | DN: "seg" 48 | DN_NUM: 100 49 | INITIALIZE_BOX_TYPE: 'no' 50 | TEST: 51 | SEMANTIC_ON: False 52 | INSTANCE_ON: True 53 | PANOPTIC_ON: False 54 | OVERLAP_THRESHOLD: 0.8 55 | OBJECT_MASK_THRESHOLD: 0.25 56 | 57 | SOLVER: 58 | AMP: 59 | ENABLED: True 60 | TEST: 61 | EVAL_PERIOD: 5000 62 | # EVAL_FLAG: 1 -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/swin/maskdino_R50_bs16_50ep_4s_dowsample1_1024.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | BACKBONE: 5 | NAME: "D2SwinTransformer" 6 | SWIN: 7 | EMBED_DIM: 192 8 | DEPTHS: [ 2, 2, 18, 2 ] 9 | NUM_HEADS: [ 6, 12, 24, 48 ] 10 | WINDOW_SIZE: 12 11 | APE: False 12 | DROP_PATH_RATE: 0.3 13 | PATCH_NORM: True 14 | PRETRAIN_IMG_SIZE: 384 15 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 16 | PIXEL_MEAN: [ 123.675, 116.280, 103.530 ] 17 | PIXEL_STD: [ 58.395, 57.120, 57.375 ] 18 | # head 19 | SEM_SEG_HEAD: 20 | NAME: "MaskDINOHead" 21 | IGNORE_VALUE: 255 22 | NUM_CLASSES: 80 23 | LOSS_WEIGHT: 1.0 24 | CONVS_DIM: 256 25 | MASK_DIM: 256 26 | NORM: "GN" 27 | # pixel decoder 28 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 29 | DIM_FEEDFORWARD: 1024 30 | NUM_FEATURE_LEVELS: 4 31 | TOTAL_NUM_FEATURE_LEVELS: 5 32 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 33 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res2","res3", "res4", "res5"] 34 | COMMON_STRIDE: 4 35 | TRANSFORMER_ENC_LAYERS: 6 36 | FEATURE_ORDER: "low2high" 37 | MaskDINO: 38 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 39 | DEEP_SUPERVISION: True 40 | NO_OBJECT_WEIGHT: 0.1 41 | CLASS_WEIGHT: 4.0 42 | MASK_WEIGHT: 5.0 43 | DICE_WEIGHT: 5.0 44 | BOX_WEIGHT: 5.0 45 | GIOU_WEIGHT: 2.0 46 | HIDDEN_DIM: 256 47 | NUM_OBJECT_QUERIES: 300 48 | NHEADS: 8 49 | DROPOUT: 0.0 50 | DIM_FEEDFORWARD: 2048 51 | ENC_LAYERS: 0 52 | PRE_NORM: False 53 | ENFORCE_INPUT_PROJ: False 54 | SIZE_DIVISIBILITY: 32 55 | DEC_LAYERS: 9 # 9+1, 9 decoder layers, add one for the loss on learnable query 56 | TRAIN_NUM_POINTS: 12544 57 | OVERSAMPLE_RATIO: 3.0 58 | IMPORTANCE_SAMPLE_RATIO: 0.75 59 | EVAL_FLAG: 1 60 | INITIAL_PRED: True 61 | TWO_STAGE: True 62 | DN: "seg" 63 | DN_NUM: 100 64 | INITIALIZE_BOX_TYPE: 'bitmask' 65 | TEST: 66 | SEMANTIC_ON: False 67 | INSTANCE_ON: True 68 | PANOPTIC_ON: False 69 | OVERLAP_THRESHOLD: 0.8 70 | OBJECT_MASK_THRESHOLD: 0.25 71 | 72 | SOLVER: 73 | AMP: 74 | ENABLED: True 75 | TEST: 76 | EVAL_PERIOD: 5000 77 | # EVAL_FLAG: 1 -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/swin/maskdino_R50_bs16_50ep_4s_dowsample1_2048.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | BACKBONE: 5 | NAME: "D2SwinTransformer" 6 | SWIN: 7 | EMBED_DIM: 192 8 | DEPTHS: [ 2, 2, 18, 2 ] 9 | NUM_HEADS: [ 6, 12, 
24, 48 ] 10 | WINDOW_SIZE: 12 11 | APE: False 12 | DROP_PATH_RATE: 0.3 13 | PATCH_NORM: True 14 | PRETRAIN_IMG_SIZE: 384 15 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 16 | PIXEL_MEAN: [ 123.675, 116.280, 103.530 ] 17 | PIXEL_STD: [ 58.395, 57.120, 57.375 ] 18 | # head 19 | SEM_SEG_HEAD: 20 | NAME: "MaskDINOHead" 21 | IGNORE_VALUE: 255 22 | NUM_CLASSES: 80 23 | LOSS_WEIGHT: 1.0 24 | CONVS_DIM: 256 25 | MASK_DIM: 256 26 | NORM: "GN" 27 | # pixel decoder 28 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 29 | DIM_FEEDFORWARD: 2048 30 | NUM_FEATURE_LEVELS: 4 31 | TOTAL_NUM_FEATURE_LEVELS: 5 32 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 33 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res2","res3", "res4", "res5"] 34 | COMMON_STRIDE: 4 35 | TRANSFORMER_ENC_LAYERS: 6 36 | FEATURE_ORDER: "low2high" 37 | MaskDINO: 38 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 39 | DEEP_SUPERVISION: True 40 | NO_OBJECT_WEIGHT: 0.1 41 | CLASS_WEIGHT: 4.0 42 | MASK_WEIGHT: 5.0 43 | DICE_WEIGHT: 5.0 44 | BOX_WEIGHT: 5.0 45 | GIOU_WEIGHT: 2.0 46 | HIDDEN_DIM: 256 47 | NUM_OBJECT_QUERIES: 300 48 | NHEADS: 8 49 | DROPOUT: 0.0 50 | DIM_FEEDFORWARD: 2048 51 | ENC_LAYERS: 0 52 | PRE_NORM: False 53 | ENFORCE_INPUT_PROJ: False 54 | SIZE_DIVISIBILITY: 32 55 | DEC_LAYERS: 9 # 9+1, 9 decoder layers, add one for the loss on learnable query 56 | TRAIN_NUM_POINTS: 12544 57 | OVERSAMPLE_RATIO: 3.0 58 | IMPORTANCE_SAMPLE_RATIO: 0.75 59 | EVAL_FLAG: 1 60 | INITIAL_PRED: True 61 | TWO_STAGE: True 62 | DN: "seg" 63 | DN_NUM: 100 64 | INITIALIZE_BOX_TYPE: 'bitmask' 65 | TEST: 66 | SEMANTIC_ON: False 67 | INSTANCE_ON: True 68 | PANOPTIC_ON: False 69 | OVERLAP_THRESHOLD: 0.8 70 | OBJECT_MASK_THRESHOLD: 0.25 71 | 72 | SOLVER: 73 | AMP: 74 | ENABLED: True 75 | TEST: 76 | EVAL_PERIOD: 5000 77 | # EVAL_FLAG: 1 -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/swin/maskdino_R50_bs16_50ep_4s_dowsample1_2048_no_maskEnhance.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | BACKBONE: 5 | NAME: "D2SwinTransformer" 6 | SWIN: 7 | EMBED_DIM: 192 8 | DEPTHS: [ 2, 2, 18, 2 ] 9 | NUM_HEADS: [ 6, 12, 24, 48 ] 10 | WINDOW_SIZE: 12 11 | APE: False 12 | DROP_PATH_RATE: 0.3 13 | PATCH_NORM: True 14 | PRETRAIN_IMG_SIZE: 384 15 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 16 | PIXEL_MEAN: [ 123.675, 116.280, 103.530 ] 17 | PIXEL_STD: [ 58.395, 57.120, 57.375 ] 18 | # head 19 | SEM_SEG_HEAD: 20 | NAME: "MaskDINOHead" 21 | IGNORE_VALUE: 255 22 | NUM_CLASSES: 80 23 | LOSS_WEIGHT: 1.0 24 | CONVS_DIM: 256 25 | MASK_DIM: 256 26 | NORM: "GN" 27 | # pixel decoder 28 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 29 | DIM_FEEDFORWARD: 2048 30 | NUM_FEATURE_LEVELS: 4 31 | TOTAL_NUM_FEATURE_LEVELS: 5 32 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 33 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res2","res3", "res4", "res5"] 34 | COMMON_STRIDE: 4 35 | TRANSFORMER_ENC_LAYERS: 6 36 | FEATURE_ORDER: "low2high" 37 | MaskDINO: 38 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 39 | DEEP_SUPERVISION: True 40 | NO_OBJECT_WEIGHT: 0.1 41 | CLASS_WEIGHT: 4.0 42 | MASK_WEIGHT: 5.0 43 | DICE_WEIGHT: 5.0 44 | BOX_WEIGHT: 5.0 45 | GIOU_WEIGHT: 2.0 46 | HIDDEN_DIM: 256 47 | NUM_OBJECT_QUERIES: 300 48 | NHEADS: 8 49 | DROPOUT: 0.0 50 | DIM_FEEDFORWARD: 2048 51 | ENC_LAYERS: 0 52 | PRE_NORM: False 53 | ENFORCE_INPUT_PROJ: False 54 | SIZE_DIVISIBILITY: 32 55 | DEC_LAYERS: 9 # 
9+1, 9 decoder layers, add one for the loss on learnable query 56 | TRAIN_NUM_POINTS: 12544 57 | OVERSAMPLE_RATIO: 3.0 58 | IMPORTANCE_SAMPLE_RATIO: 0.75 59 | EVAL_FLAG: 1 60 | INITIAL_PRED: True 61 | TWO_STAGE: True 62 | DN: "seg" 63 | DN_NUM: 100 64 | INITIALIZE_BOX_TYPE: 'no' 65 | TEST: 66 | SEMANTIC_ON: False 67 | INSTANCE_ON: True 68 | PANOPTIC_ON: False 69 | OVERLAP_THRESHOLD: 0.8 70 | OBJECT_MASK_THRESHOLD: 0.25 71 | 72 | SOLVER: 73 | AMP: 74 | ENABLED: True 75 | TEST: 76 | EVAL_PERIOD: 5000 77 | # EVAL_FLAG: 1 -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/Base-COCO-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train_panoptic",) 18 | TEST: ("coco_2017_val_panoptic_with_sem_seg",) # to evaluate instance and semantic performance as well 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_panoptic_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/maskdino_R50_bs16_50ep_3s_dowsample1_2048.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskDINOHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 133 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 14 | DIM_FEEDFORWARD: 2048 15 | NUM_FEATURE_LEVELS: 3 16 | TOTAL_NUM_FEATURE_LEVELS: 4 17 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 18 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 19 | COMMON_STRIDE: 4 20 | TRANSFORMER_ENC_LAYERS: 6 21 | FEATURE_ORDER: "low2high" 22 | MaskDINO: 23 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 24 | DEEP_SUPERVISION: True 25 | NO_OBJECT_WEIGHT: 0.1 26 | CLASS_WEIGHT: 4.0 27 | MASK_WEIGHT: 5.0 28 | DICE_WEIGHT: 5.0 29 | BOX_WEIGHT: 5.0 30 | GIOU_WEIGHT: 2.0 31 | HIDDEN_DIM: 256 32 | NUM_OBJECT_QUERIES: 300 33 | NHEADS: 8 34 | DROPOUT: 0.0 35 | DIM_FEEDFORWARD: 2048 36 | ENC_LAYERS: 0 37 | PRE_NORM: False 38 | ENFORCE_INPUT_PROJ: False 39 | SIZE_DIVISIBILITY: 32 40 | DEC_LAYERS: 9 # 9+1, 9 decoder layers, add one for the loss on learnable query 41 | TRAIN_NUM_POINTS: 12544 42 | OVERSAMPLE_RATIO: 3.0 43 | IMPORTANCE_SAMPLE_RATIO: 0.75 44 | EVAL_FLAG: 1 45 | INITIAL_PRED: True 46 | TWO_STAGE: 
True 47 | DN: "seg" 48 | DN_NUM: 100 49 | INITIALIZE_BOX_TYPE: 'no' 50 | PANO_BOX_LOSS: False 51 | TEST: 52 | SEMANTIC_ON: True 53 | INSTANCE_ON: True 54 | PANOPTIC_ON: True 55 | OVERLAP_THRESHOLD: 0.8 56 | OBJECT_MASK_THRESHOLD: 0.25 57 | 58 | SOLVER: 59 | AMP: 60 | ENABLED: True 61 | TEST: 62 | EVAL_PERIOD: 5000 63 | # EVAL_FLAG: 1 -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/swin/maskdino_R50_bs16_50ep_4s_dowsample1_2048.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../Base-COCO-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | BACKBONE: 5 | NAME: "D2SwinTransformer" 6 | SWIN: 7 | EMBED_DIM: 192 8 | DEPTHS: [ 2, 2, 18, 2 ] 9 | NUM_HEADS: [ 6, 12, 24, 48 ] 10 | WINDOW_SIZE: 12 11 | APE: False 12 | DROP_PATH_RATE: 0.3 13 | PATCH_NORM: True 14 | PRETRAIN_IMG_SIZE: 384 15 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 16 | PIXEL_MEAN: [ 123.675, 116.280, 103.530 ] 17 | PIXEL_STD: [ 58.395, 57.120, 57.375 ] 18 | # head 19 | SEM_SEG_HEAD: 20 | NAME: "MaskDINOHead" 21 | IGNORE_VALUE: 255 22 | NUM_CLASSES: 133 23 | LOSS_WEIGHT: 1.0 24 | CONVS_DIM: 256 25 | MASK_DIM: 256 26 | NORM: "GN" 27 | # pixel decoder 28 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 29 | DIM_FEEDFORWARD: 2048 30 | NUM_FEATURE_LEVELS: 4 31 | TOTAL_NUM_FEATURE_LEVELS: 5 32 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 33 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res2","res3", "res4", "res5"] 34 | COMMON_STRIDE: 4 35 | TRANSFORMER_ENC_LAYERS: 6 36 | FEATURE_ORDER: "low2high" 37 | MaskDINO: 38 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 39 | DEEP_SUPERVISION: True 40 | NO_OBJECT_WEIGHT: 0.1 41 | CLASS_WEIGHT: 4.0 42 | MASK_WEIGHT: 5.0 43 | DICE_WEIGHT: 5.0 44 | BOX_WEIGHT: 5.0 45 | GIOU_WEIGHT: 2.0 46 | HIDDEN_DIM: 256 47 | NUM_OBJECT_QUERIES: 300 48 | NHEADS: 8 49 | DROPOUT: 0.0 50 | DIM_FEEDFORWARD: 2048 51 | ENC_LAYERS: 0 52 | PRE_NORM: False 53 | ENFORCE_INPUT_PROJ: False 54 | SIZE_DIVISIBILITY: 32 55 | DEC_LAYERS: 9 # 9+1, 9 decoder layers, add one for the loss on learnable query 56 | TRAIN_NUM_POINTS: 12544 57 | OVERSAMPLE_RATIO: 3.0 58 | IMPORTANCE_SAMPLE_RATIO: 0.75 59 | EVAL_FLAG: 1 60 | INITIAL_PRED: True 61 | TWO_STAGE: True 62 | DN: "seg" 63 | DN_NUM: 100 64 | INITIALIZE_BOX_TYPE: 'no' 65 | PANO_BOX_LOSS: False 66 | TEST: 67 | SEMANTIC_ON: True 68 | INSTANCE_ON: True 69 | PANOPTIC_ON: True 70 | OVERLAP_THRESHOLD: 0.8 71 | OBJECT_MASK_THRESHOLD: 0.25 72 | 73 | SOLVER: 74 | AMP: 75 | ENABLED: True 76 | TEST: 77 | EVAL_PERIOD: 5000 78 | # EVAL_FLAG: 1 -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/swin/maskdino_R50_bs16_50ep_4s_dowsample1_large_eval.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../Base-COCO-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | BACKBONE: 5 | NAME: "D2SwinTransformer" 6 | SWIN: 7 | EMBED_DIM: 192 8 | DEPTHS: [ 2, 2, 18, 2 ] 9 | NUM_HEADS: [ 6, 12, 24, 48 ] 10 | WINDOW_SIZE: 12 11 | APE: False 12 | DROP_PATH_RATE: 0.3 13 | PATCH_NORM: True 14 | PRETRAIN_IMG_SIZE: 384 15 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 16 | PIXEL_MEAN: [ 123.675, 116.280, 103.530 ] 17 | PIXEL_STD: [ 58.395, 57.120, 57.375 ] 18 | # head 19 | SEM_SEG_HEAD: 20 | NAME: "MaskDINOHead" 21 | IGNORE_VALUE: 255 22 | NUM_CLASSES: 133 23 | LOSS_WEIGHT: 1.0 24 | CONVS_DIM: 256 25 | MASK_DIM: 256 26 | NORM: "GN" 27 | 
# pixel decoder 28 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 29 | DIM_FEEDFORWARD: 2048 30 | NUM_FEATURE_LEVELS: 4 31 | TOTAL_NUM_FEATURE_LEVELS: 5 32 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 33 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res2","res3", "res4", "res5"] 34 | COMMON_STRIDE: 4 35 | TRANSFORMER_ENC_LAYERS: 6 36 | FEATURE_ORDER: "low2high" 37 | MaskDINO: 38 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 39 | DEEP_SUPERVISION: True 40 | NO_OBJECT_WEIGHT: 0.1 41 | CLASS_WEIGHT: 4.0 42 | MASK_WEIGHT: 5.0 43 | DICE_WEIGHT: 5.0 44 | BOX_WEIGHT: 5.0 45 | GIOU_WEIGHT: 2.0 46 | HIDDEN_DIM: 256 47 | NUM_OBJECT_QUERIES: 300 48 | NHEADS: 8 49 | DROPOUT: 0.0 50 | DIM_FEEDFORWARD: 2048 51 | ENC_LAYERS: 0 52 | PRE_NORM: False 53 | ENFORCE_INPUT_PROJ: False 54 | SIZE_DIVISIBILITY: 32 55 | DEC_LAYERS: 9 # 9+1, 9 decoder layers, add one for the loss on learnable query 56 | TRAIN_NUM_POINTS: 12544 57 | OVERSAMPLE_RATIO: 3.0 58 | IMPORTANCE_SAMPLE_RATIO: 0.75 59 | EVAL_FLAG: 1 60 | INITIAL_PRED: True 61 | TWO_STAGE: True 62 | DN: "seg" 63 | DN_NUM: 100 64 | INITIALIZE_BOX_TYPE: 'no' 65 | PANO_BOX_LOSS: False 66 | TEST: 67 | SEMANTIC_ON: True 68 | INSTANCE_ON: True 69 | PANOPTIC_ON: True 70 | OVERLAP_THRESHOLD: 0.8 71 | OBJECT_MASK_THRESHOLD: 0.25 72 | 73 | SOLVER: 74 | AMP: 75 | ENABLED: True 76 | TEST: 77 | EVAL_PERIOD: 5000 78 | # EVAL_FLAG: 1 -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/swin/maskdino_R50_bs16_50ep_4s_dowsample1_maskEnhance_2048.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../Base-COCO-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskDINO" 4 | BACKBONE: 5 | NAME: "D2SwinTransformer" 6 | SWIN: 7 | EMBED_DIM: 192 8 | DEPTHS: [ 2, 2, 18, 2 ] 9 | NUM_HEADS: [ 6, 12, 24, 48 ] 10 | WINDOW_SIZE: 12 11 | APE: False 12 | DROP_PATH_RATE: 0.3 13 | PATCH_NORM: True 14 | PRETRAIN_IMG_SIZE: 384 15 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 16 | PIXEL_MEAN: [ 123.675, 116.280, 103.530 ] 17 | PIXEL_STD: [ 58.395, 57.120, 57.375 ] 18 | # head 19 | SEM_SEG_HEAD: 20 | NAME: "MaskDINOHead" 21 | IGNORE_VALUE: 255 22 | NUM_CLASSES: 133 23 | LOSS_WEIGHT: 1.0 24 | CONVS_DIM: 256 25 | MASK_DIM: 256 26 | NORM: "GN" 27 | # pixel decoder 28 | PIXEL_DECODER_NAME: "MaskDINOEncoder" 29 | DIM_FEEDFORWARD: 2048 30 | NUM_FEATURE_LEVELS: 4 31 | TOTAL_NUM_FEATURE_LEVELS: 5 32 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 33 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res2","res3", "res4", "res5"] 34 | COMMON_STRIDE: 4 35 | TRANSFORMER_ENC_LAYERS: 6 36 | FEATURE_ORDER: "low2high" 37 | MaskDINO: 38 | TRANSFORMER_DECODER_NAME: "MaskDINODecoder" 39 | DEEP_SUPERVISION: True 40 | NO_OBJECT_WEIGHT: 0.1 41 | CLASS_WEIGHT: 4.0 42 | MASK_WEIGHT: 5.0 43 | DICE_WEIGHT: 5.0 44 | BOX_WEIGHT: 5.0 45 | GIOU_WEIGHT: 2.0 46 | HIDDEN_DIM: 256 47 | NUM_OBJECT_QUERIES: 300 48 | NHEADS: 8 49 | DROPOUT: 0.0 50 | DIM_FEEDFORWARD: 2048 51 | ENC_LAYERS: 0 52 | PRE_NORM: False 53 | ENFORCE_INPUT_PROJ: False 54 | SIZE_DIVISIBILITY: 32 55 | DEC_LAYERS: 9 # 9+1, 9 decoder layers, add one for the loss on learnable query 56 | TRAIN_NUM_POINTS: 12544 57 | OVERSAMPLE_RATIO: 3.0 58 | IMPORTANCE_SAMPLE_RATIO: 0.75 59 | EVAL_FLAG: 1 60 | INITIAL_PRED: True 61 | TWO_STAGE: True 62 | DN: "seg" 63 | DN_NUM: 100 64 | INITIALIZE_BOX_TYPE: 'bitmask' 65 | PANO_BOX_LOSS: False 66 | TEST: 67 | SEMANTIC_ON: True 68 | INSTANCE_ON: True 69 | PANOPTIC_ON: True 70 | OVERLAP_THRESHOLD: 0.8 71 | 
OBJECT_MASK_THRESHOLD: 0.25 72 | 73 | SOLVER: 74 | AMP: 75 | ENABLED: True 76 | TEST: 77 | EVAL_PERIOD: 5000 78 | # EVAL_FLAG: 1 -------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | # Prepare Datasets for MaskDINO 2 | 3 | A dataset can be used by accessing [DatasetCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.DatasetCatalog) 4 | for its data, or [MetadataCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.MetadataCatalog) for its metadata (class names, etc.). 5 | This document explains how to set up the builtin datasets so they can be used by the above APIs. 6 | [Use Custom Datasets](https://detectron2.readthedocs.io/tutorials/datasets.html) gives a deeper dive on how to use `DatasetCatalog` and `MetadataCatalog`, 7 | and how to add new datasets to them. 8 | 9 | MaskDINO has builtin support for a few datasets. 10 | The datasets are assumed to exist in a directory specified by the environment variable 11 | `DETECTRON2_DATASETS`. 12 | Under this directory, detectron2 will look for datasets in the structure described below, if needed. 13 | ``` 14 | $DETECTRON2_DATASETS/ 15 | ADEChallengeData2016/ 16 | coco/ 17 | cityscapes/ 18 | ``` 19 | 20 | You can set the location for the builtin datasets with `export DETECTRON2_DATASETS=/path/to/datasets`. 21 | If left unset, the default is `./datasets` relative to your current working directory. 22 | 23 | [comment]: <> (The [model zoo](https://github.com/facebookresearch/MaskFormer/blob/master/MODEL_ZOO.md)) 24 | 25 | [comment]: <> (contains configs and models that use these builtin datasets.) 26 | 27 | 28 | ## Expected dataset structure for [COCO](https://cocodataset.org/#download): 29 | 30 | ``` 31 | coco/ 32 | annotations/ 33 | instances_{train,val}2017.json 34 | panoptic_{train,val}2017.json 35 | {train,val}2017/ 36 | # image files that are mentioned in the corresponding json 37 | panoptic_{train,val}2017/ # png annotations 38 | panoptic_semseg_{train,val}2017/ # generated by the script mentioned below 39 | ``` 40 | 41 | Install panopticapi by: 42 | ``` 43 | pip install git+https://github.com/cocodataset/panopticapi.git 44 | ``` 45 | Then run `python datasets/prepare_coco_semantic_annos_from_panoptic_annos.py` to extract semantic annotations from panoptic annotations (only used for evaluation). 46 | 47 | 48 | ## Expected dataset structure for [cityscapes](https://www.cityscapes-dataset.com/downloads/): 49 | ``` 50 | cityscapes/ 51 | gtFine/ 52 | train/ 53 | aachen/ 54 | color.png, instanceIds.png, labelIds.png, polygons.json, 55 | labelTrainIds.png 56 | ... 57 | val/ 58 | test/ 59 | # below are generated Cityscapes panoptic annotations 60 | cityscapes_panoptic_train.json 61 | cityscapes_panoptic_train/ 62 | cityscapes_panoptic_val.json 63 | cityscapes_panoptic_val/ 64 | cityscapes_panoptic_test.json 65 | cityscapes_panoptic_test/ 66 | leftImg8bit/ 67 | train/ 68 | val/ 69 | test/ 70 | ``` 71 | Install cityscapes scripts by: 72 | ``` 73 | pip install git+https://github.com/mcordts/cityscapesScripts.git 74 | ``` 75 | 76 | Note: to create labelTrainIds.png, first prepare the above structure, then run the cityscapesScripts tool with: 77 | ``` 78 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesscripts/preparation/createTrainIdLabelImgs.py 79 | ``` 80 | These files are not needed for instance segmentation.
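The same labelTrainIds step can also be driven from Python rather than the shell. The snippet below is a minimal sketch and not part of this repository; it assumes the pip-installed cityscapesScripts package exposes a `main()` entry point in `createTrainIdLabelImgs` that reads `CITYSCAPES_DATASET` from the environment, as the shell command above suggests.

```python
# Illustrative only: drive the cityscapesScripts preparation step from Python.
# Assumes cityscapesScripts is installed (see above) and that createTrainIdLabelImgs.main()
# picks up the CITYSCAPES_DATASET environment variable.
import os

os.environ["CITYSCAPES_DATASET"] = "/path/to/abovementioned/cityscapes"

from cityscapesscripts.preparation.createTrainIdLabelImgs import main as create_train_id_labels

create_train_id_labels()  # writes the *_labelTrainIds.png files next to the gtFine annotations
```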
81 | 82 | Note: to generate Cityscapes panoptic dataset, run cityscapesescript with: 83 | ``` 84 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesscripts/preparation/createPanopticImgs.py 85 | ``` 86 | These files are not needed for semantic and instance segmentation. 87 | 88 | 89 | ## Expected dataset structure for [ADE20k](http://sceneparsing.csail.mit.edu/): 90 | ``` 91 | ADEChallengeData2016/ 92 | images/ 93 | annotations/ 94 | objectInfo150.txt 95 | # download instance annotation 96 | annotations_instance/ 97 | # generated by prepare_ade20k_sem_seg.py 98 | annotations_detectron2/ 99 | # below are generated by prepare_ade20k_pan_seg.py 100 | ade20k_panoptic_{train,val}.json 101 | ade20k_panoptic_{train,val}/ 102 | # below are generated by prepare_ade20k_ins_seg.py 103 | ade20k_instance_{train,val}.json 104 | ``` 105 | 106 | The directory `annotations_detectron2` is generated by running `python datasets/prepare_ade20k_sem_seg.py`. 107 | 108 | Install panopticapi by: 109 | ```bash 110 | pip install git+https://github.com/cocodataset/panopticapi.git 111 | ``` 112 | 113 | Download the instance annotation from http://sceneparsing.csail.mit.edu/: 114 | ```bash 115 | wget http://sceneparsing.csail.mit.edu/data/ChallengeData2017/annotations_instance.tar 116 | ``` 117 | 118 | Then, run `python datasets/prepare_ade20k_pan_seg.py`, to combine semantic and instance annotations for panoptic annotations. 119 | 120 | And run `python datasets/prepare_ade20k_ins_seg.py`, to extract instance annotations in COCO format. 121 | 122 | 123 | -------------------------------------------------------------------------------- /datasets/ade20k_instance_catid_mapping.txt: -------------------------------------------------------------------------------- 1 | Instacne100 SceneParse150 FullADE20K 2 | 1 8 165 3 | 2 9 3055 4 | 3 11 350 5 | 4 13 1831 6 | 5 15 774 7 | 5 15 783 8 | 6 16 2684 9 | 7 19 687 10 | 8 20 471 11 | 9 21 401 12 | 10 23 1735 13 | 11 24 2473 14 | 12 25 2329 15 | 13 28 1564 16 | 14 31 57 17 | 15 32 2272 18 | 16 33 907 19 | 17 34 724 20 | 18 36 2985 21 | 18 36 533 22 | 19 37 1395 23 | 20 38 155 24 | 21 39 2053 25 | 22 40 689 26 | 23 42 266 27 | 24 43 581 28 | 25 44 2380 29 | 26 45 491 30 | 27 46 627 31 | 28 48 2388 32 | 29 50 943 33 | 30 51 2096 34 | 31 54 2530 35 | 32 56 420 36 | 33 57 1948 37 | 34 58 1869 38 | 35 59 2251 39 | 36 63 239 40 | 37 65 571 41 | 38 66 2793 42 | 39 67 978 43 | 40 68 236 44 | 41 70 181 45 | 42 71 629 46 | 43 72 2598 47 | 44 73 1744 48 | 45 74 1374 49 | 46 75 591 50 | 47 76 2679 51 | 48 77 223 52 | 49 79 47 53 | 50 81 327 54 | 51 82 2821 55 | 52 83 1451 56 | 53 84 2880 57 | 54 86 480 58 | 55 87 77 59 | 56 88 2616 60 | 57 89 246 61 | 57 89 247 62 | 58 90 2733 63 | 59 91 14 64 | 60 93 38 65 | 61 94 1936 66 | 62 96 120 67 | 63 98 1702 68 | 64 99 249 69 | 65 103 2928 70 | 66 104 2337 71 | 67 105 1023 72 | 68 108 2989 73 | 69 109 1930 74 | 70 111 2586 75 | 71 112 131 76 | 72 113 146 77 | 73 116 95 78 | 74 117 1563 79 | 75 119 1708 80 | 76 120 103 81 | 77 121 1002 82 | 78 122 2569 83 | 79 124 2833 84 | 80 125 1551 85 | 81 126 1981 86 | 82 127 29 87 | 83 128 187 88 | 84 130 747 89 | 85 131 2254 90 | 86 133 2262 91 | 87 134 1260 92 | 88 135 2243 93 | 89 136 2932 94 | 90 137 2836 95 | 91 138 2850 96 | 92 139 64 97 | 93 140 894 98 | 94 143 1919 99 | 95 144 1583 100 | 96 145 318 101 | 97 147 2046 102 | 98 148 1098 103 | 99 149 530 104 | 100 150 954 105 | -------------------------------------------------------------------------------- 
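The next file, `datasets/prepare_ade20k_ins_seg.py`, implements the instance-annotation extraction step described in the README above and writes `ade20k_instance_{train,val}.json`. A quick way to sanity-check its output is to load the generated JSON with pycocotools; the snippet below is a minimal sketch, not part of this repository, and assumes the default `./datasets` location used when `DETECTRON2_DATASETS` is unset.

```python
# Illustrative sanity check for the generated ADE20K instance annotations.
import os

from pycocotools.coco import COCO

dataset_dir = os.getenv("DETECTRON2_DATASETS", "datasets")
ann_file = os.path.join(dataset_dir, "ADEChallengeData2016", "ade20k_instance_val.json")

coco = COCO(ann_file)  # parses the COCO-format JSON written by prepare_ade20k_ins_seg.py
print("images:", len(coco.getImgIds()))
print("annotations:", len(coco.getAnnIds()))
print("categories:", len(coco.getCatIds()))  # 100 instance categories are expected for ADE20K
```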
/datasets/prepare_ade20k_ins_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | import glob 5 | import json 6 | import os 7 | from collections import Counter 8 | 9 | import numpy as np 10 | import tqdm 11 | from panopticapi.utils import IdGenerator, save_json 12 | from PIL import Image 13 | import pycocotools.mask as mask_util 14 | 15 | 16 | if __name__ == "__main__": 17 | dataset_dir = os.getenv("DETECTRON2_DATASETS", "datasets") 18 | 19 | for name, dirname in [("train", "training"), ("val", "validation")]: 20 | image_dir = os.path.join(dataset_dir, f"ADEChallengeData2016/images/{dirname}/") 21 | instance_dir = os.path.join( 22 | dataset_dir, f"ADEChallengeData2016/annotations_instance/{dirname}/" 23 | ) 24 | 25 | # img_id = 0 26 | ann_id = 1 27 | 28 | # json 29 | out_file = os.path.join(dataset_dir, f"ADEChallengeData2016/ade20k_instance_{name}.json") 30 | 31 | # json config 32 | instance_config_file = "datasets/ade20k_instance_imgCatIds.json" 33 | with open(instance_config_file) as f: 34 | category_dict = json.load(f)["categories"] 35 | 36 | # load catid mapping 37 | # it is important to share category id for both instance and panoptic annotations 38 | mapping_file = "datasets/ade20k_instance_catid_mapping.txt" 39 | with open(mapping_file) as f: 40 | map_id = {} 41 | for i, line in enumerate(f.readlines()): 42 | if i == 0: 43 | continue 44 | ins_id, sem_id, _ = line.strip().split() 45 | # shift id by 1 because we want it to start from 0! 46 | # ignore_label becomes 255 47 | map_id[int(ins_id)] = int(sem_id) - 1 48 | 49 | for cat in category_dict: 50 | cat["id"] = map_id[cat["id"]] 51 | 52 | filenames = sorted(glob.glob(os.path.join(image_dir, "*.jpg"))) 53 | 54 | ann_dict = {} 55 | images = [] 56 | annotations = [] 57 | 58 | for idx, filename in enumerate(tqdm.tqdm(filenames)): 59 | image = {} 60 | image_id = os.path.basename(filename).split(".")[0] 61 | 62 | image["id"] = image_id 63 | image["file_name"] = os.path.basename(filename) 64 | 65 | original_format = np.array(Image.open(filename)) 66 | image["width"] = original_format.shape[1] 67 | image["height"] = original_format.shape[0] 68 | 69 | images.append(image) 70 | 71 | filename_instance = os.path.join(instance_dir, image_id + ".png") 72 | ins_seg = np.asarray(Image.open(filename_instance)) 73 | assert ins_seg.dtype == np.uint8 74 | 75 | instance_cat_ids = ins_seg[..., 0] 76 | # instance id starts from 1! 
77 | # because 0 is reserved as VOID label 78 | instance_ins_ids = ins_seg[..., 1] 79 | 80 | # process things 81 | for thing_id in np.unique(instance_ins_ids): 82 | if thing_id == 0: 83 | continue 84 | mask = instance_ins_ids == thing_id 85 | instance_cat_id = np.unique(instance_cat_ids[mask]) 86 | assert len(instance_cat_id) == 1 87 | 88 | anno = {} 89 | anno['id'] = ann_id 90 | ann_id += 1 91 | anno['image_id'] = image['id'] 92 | anno["iscrowd"] = int(0) 93 | anno["category_id"] = int(map_id[instance_cat_id[0]]) 94 | 95 | inds = np.nonzero(mask) 96 | ymin, ymax = inds[0].min(), inds[0].max() 97 | xmin, xmax = inds[1].min(), inds[1].max() 98 | anno["bbox"] = [int(xmin), int(ymin), int(xmax - xmin + 1), int(ymax - ymin + 1)] 99 | # if xmax <= xmin or ymax <= ymin: 100 | # continue 101 | rle = mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0] 102 | rle["counts"] = rle["counts"].decode("utf-8") 103 | anno["segmentation"] = rle 104 | anno["area"] = int(mask_util.area(rle)) 105 | annotations.append(anno) 106 | 107 | # save this 108 | ann_dict['images'] = images 109 | ann_dict['categories'] = category_dict 110 | ann_dict['annotations'] = annotations 111 | 112 | save_json(ann_dict, out_file) 113 | -------------------------------------------------------------------------------- /datasets/prepare_ade20k_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | import os 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import tqdm 9 | from PIL import Image 10 | 11 | 12 | def convert(input, output): 13 | img = np.asarray(Image.open(input)) 14 | assert img.dtype == np.uint8 15 | img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1 16 | Image.fromarray(img).save(output) 17 | 18 | 19 | if __name__ == "__main__": 20 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016" 21 | for name in ["training", "validation"]: 22 | annotation_dir = dataset_dir / "annotations" / name 23 | output_dir = dataset_dir / "annotations_detectron2" / name 24 | output_dir.mkdir(parents=True, exist_ok=True) 25 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 26 | output_file = output_dir / file.name 27 | convert(file, output_file) 28 | -------------------------------------------------------------------------------- /datasets/prepare_coco_semantic_annos_from_panoptic_annos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 
4 | 5 | import functools 6 | import json 7 | import multiprocessing as mp 8 | import numpy as np 9 | import os 10 | import time 11 | from fvcore.common.download import download 12 | from panopticapi.utils import rgb2id 13 | from PIL import Image 14 | 15 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES 16 | 17 | 18 | def _process_panoptic_to_semantic(input_panoptic, output_semantic, segments, id_map): 19 | panoptic = np.asarray(Image.open(input_panoptic), dtype=np.uint32) 20 | panoptic = rgb2id(panoptic) 21 | output = np.zeros_like(panoptic, dtype=np.uint8) + 255 22 | for seg in segments: 23 | cat_id = seg["category_id"] 24 | new_cat_id = id_map[cat_id] 25 | output[panoptic == seg["id"]] = new_cat_id 26 | Image.fromarray(output).save(output_semantic) 27 | 28 | 29 | def separate_coco_semantic_from_panoptic(panoptic_json, panoptic_root, sem_seg_root, categories): 30 | """ 31 | Create semantic segmentation annotations from panoptic segmentation 32 | annotations, to be used by PanopticFPN. 33 | It maps all thing categories to class 0, and maps all unlabeled pixels to class 255. 34 | It maps all stuff categories to contiguous ids starting from 1. 35 | Args: 36 | panoptic_json (str): path to the panoptic json file, in COCO's format. 37 | panoptic_root (str): a directory with panoptic annotation files, in COCO's format. 38 | sem_seg_root (str): a directory to output semantic annotation files 39 | categories (list[dict]): category metadata. Each dict needs to have: 40 | "id": corresponds to the "category_id" in the json annotations 41 | "isthing": 0 or 1 42 | """ 43 | os.makedirs(sem_seg_root, exist_ok=True) 44 | 45 | id_map = {} # map from category id to id in the output semantic annotation 46 | assert len(categories) <= 254 47 | for i, k in enumerate(categories): 48 | id_map[k["id"]] = i 49 | # what is id = 0? 50 | # id_map[0] = 255 51 | print(id_map) 52 | 53 | with open(panoptic_json) as f: 54 | obj = json.load(f) 55 | 56 | pool = mp.Pool(processes=max(mp.cpu_count() // 2, 4)) 57 | 58 | def iter_annotations(): 59 | for anno in obj["annotations"]: 60 | file_name = anno["file_name"] 61 | segments = anno["segments_info"] 62 | input = os.path.join(panoptic_root, file_name) 63 | output = os.path.join(sem_seg_root, file_name) 64 | yield input, output, segments 65 | 66 | print("Start writing to {} ...".format(sem_seg_root)) 67 | start = time.time() 68 | pool.starmap( 69 | functools.partial(_process_panoptic_to_semantic, id_map=id_map), 70 | iter_annotations(), 71 | chunksize=100, 72 | ) 73 | print("Finished. time: {:.2f}s".format(time.time() - start)) 74 | 75 | 76 | if __name__ == "__main__": 77 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco") 78 | for s in ["val2017", "train2017"]: 79 | separate_coco_semantic_from_panoptic( 80 | os.path.join(dataset_dir, "annotations/panoptic_{}.json".format(s)), 81 | os.path.join(dataset_dir, "panoptic_{}".format(s)), 82 | os.path.join(dataset_dir, "panoptic_semseg_{}".format(s)), 83 | COCO_CATEGORIES, 84 | ) 85 | -------------------------------------------------------------------------------- /demo/README.md: -------------------------------------------------------------------------------- 1 | ## Getting Started with MaskDINO 2 | 3 | This document provides a brief intro of the usage of **MaskDINO**. 4 | 5 | Please see [Getting Started with Detectron2](https://github.com/facebookresearch/detectron2/blob/master/GETTING_STARTED.md) for full usage. 6 | 7 | 8 | ### Inference Demo with Pre-trained Models 9 | 10 | 1. 
Pick a model and its config file 11 | - for example 12 | - config file at `/configs/coco/instance-segmentation/maskdino_R50_bs16_50ep_3s.yaml`. 13 | - Model file [MaskDINO (hid 1024) ](https://github.com/IDEA-Research/detrex-storage/releases/download/maskdino-v0.1.0/maskdino_r50_50ep_300q_hid1024_3sd1_instance_maskenhanced_mask46.1ap_box51.5ap.pth) 14 | 2. We provide `demo.py` that is able to demo builtin configs. 15 | 3. Run it with: 16 | ``` 17 | cd demo/ 18 | python demo.py --config-file /configs/coco/instance-segmentation/maskdino_R50_bs16_50ep_3s.yaml \ 19 | --input input1.jpg input2.jpg \ 20 | [--other-options] 21 | --opts MODEL.WEIGHTS /path/to/model_file 22 | ``` 23 | The configs are made for training, therefore we need to specify `MODEL.WEIGHTS` to a model from model zoo for evaluation. 24 | This command will run the inference and show visualizations in an OpenCV window. 25 | 26 | For details of the command line arguments, see `demo.py -h` or look at its source code 27 | to understand its behavior. Some common arguments are: 28 | * To run __on your webcam__, replace `--input files` with `--webcam`. 29 | * To run __on a video__, replace `--input files` with `--video-input video.mp4`. 30 | * To run __on cpu__, add `MODEL.DEVICE cpu` after `--opts`. 31 | * To save outputs to a directory (for images) or a file (for webcam or video), use `--output`. 32 | 33 | 34 | -------------------------------------------------------------------------------- /demo/demo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py 3 | import argparse 4 | import glob 5 | import multiprocessing as mp 6 | import os 7 | 8 | # fmt: off 9 | import sys 10 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 11 | # fmt: on 12 | 13 | import tempfile 14 | import time 15 | import warnings 16 | 17 | import cv2 18 | import numpy as np 19 | import tqdm 20 | 21 | from detectron2.config import get_cfg 22 | from detectron2.data.detection_utils import read_image 23 | from detectron2.projects.deeplab import add_deeplab_config 24 | from detectron2.utils.logger import setup_logger 25 | 26 | from maskdino import add_maskdino_config 27 | from predictor import VisualizationDemo 28 | 29 | 30 | # constants 31 | WINDOW_NAME = "mask2former demo" 32 | 33 | 34 | def setup_cfg(args): 35 | # load config from file and command-line arguments 36 | cfg = get_cfg() 37 | add_deeplab_config(cfg) 38 | add_maskdino_config(cfg) 39 | cfg.merge_from_file(args.config_file) 40 | cfg.merge_from_list(args.opts) 41 | cfg.freeze() 42 | return cfg 43 | 44 | 45 | def get_parser(): 46 | parser = argparse.ArgumentParser(description="maskdino demo for builtin configs") 47 | parser.add_argument( 48 | "--config-file", 49 | default="configs/coco/instance-segmentation/maskdino_R50_bs16_50ep_3s.yaml", 50 | metavar="FILE", 51 | help="path to config file", 52 | ) 53 | parser.add_argument("--webcam", action="store_true", help="Take inputs from webcam.") 54 | parser.add_argument("--video-input", help="Path to video file.") 55 | parser.add_argument( 56 | "--input", 57 | nargs="+", 58 | help="A list of space separated input images; " 59 | "or a single glob pattern such as 'directory/*.jpg'", 60 | ) 61 | parser.add_argument( 62 | "--output", 63 | help="A file or directory to save output visualizations. 
" 64 | "If not given, will show output in an OpenCV window.", 65 | ) 66 | 67 | parser.add_argument( 68 | "--confidence-threshold", 69 | type=float, 70 | default=0.5, 71 | help="Minimum score for instance predictions to be shown", 72 | ) 73 | parser.add_argument( 74 | "--opts", 75 | help="Modify config options using the command-line 'KEY VALUE' pairs", 76 | default=[], 77 | nargs=argparse.REMAINDER, 78 | ) 79 | return parser 80 | 81 | 82 | def test_opencv_video_format(codec, file_ext): 83 | with tempfile.TemporaryDirectory(prefix="video_format_test") as dir: 84 | filename = os.path.join(dir, "test_file" + file_ext) 85 | writer = cv2.VideoWriter( 86 | filename=filename, 87 | fourcc=cv2.VideoWriter_fourcc(*codec), 88 | fps=float(30), 89 | frameSize=(10, 10), 90 | isColor=True, 91 | ) 92 | [writer.write(np.zeros((10, 10, 3), np.uint8)) for _ in range(30)] 93 | writer.release() 94 | if os.path.isfile(filename): 95 | return True 96 | return False 97 | 98 | 99 | if __name__ == "__main__": 100 | mp.set_start_method("spawn", force=True) 101 | args = get_parser().parse_args() 102 | setup_logger(name="fvcore") 103 | logger = setup_logger() 104 | logger.info("Arguments: " + str(args)) 105 | 106 | cfg = setup_cfg(args) 107 | 108 | demo = VisualizationDemo(cfg) 109 | 110 | if args.input: 111 | if len(args.input) == 1: 112 | args.input = glob.glob(os.path.expanduser(args.input[0])) 113 | assert args.input, "The input path(s) was not found" 114 | for path in tqdm.tqdm(args.input, disable=not args.output): 115 | # use PIL, to be consistent with evaluation 116 | img = read_image(path, format="BGR") 117 | start_time = time.time() 118 | predictions, visualized_output = demo.run_on_image(img) 119 | logger.info( 120 | "{}: {} in {:.2f}s".format( 121 | path, 122 | "detected {} instances".format(len(predictions["instances"])) 123 | if "instances" in predictions 124 | else "finished", 125 | time.time() - start_time, 126 | ) 127 | ) 128 | 129 | if args.output: 130 | if os.path.isdir(args.output): 131 | assert os.path.isdir(args.output), args.output 132 | out_filename = os.path.join(args.output, os.path.basename(path)) 133 | else: 134 | assert len(args.input) == 1, "Please specify a directory with args.output" 135 | out_filename = args.output 136 | visualized_output.save(out_filename) 137 | else: 138 | cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) 139 | cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1]) 140 | if cv2.waitKey(0) == 27: 141 | break # esc to quit 142 | elif args.webcam: 143 | assert args.input is None, "Cannot have both --input and --webcam!" 144 | assert args.output is None, "output not yet supported with --webcam!" 
145 | cam = cv2.VideoCapture(0) 146 | for vis in tqdm.tqdm(demo.run_on_video(cam)): 147 | cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) 148 | cv2.imshow(WINDOW_NAME, vis) 149 | if cv2.waitKey(1) == 27: 150 | break # esc to quit 151 | cam.release() 152 | cv2.destroyAllWindows() 153 | elif args.video_input: 154 | video = cv2.VideoCapture(args.video_input) 155 | width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH)) 156 | height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) 157 | frames_per_second = video.get(cv2.CAP_PROP_FPS) 158 | num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) 159 | basename = os.path.basename(args.video_input) 160 | codec, file_ext = ( 161 | ("x264", ".mkv") if test_opencv_video_format("x264", ".mkv") else ("mp4v", ".mp4") 162 | ) 163 | if codec == ".mp4v": 164 | warnings.warn("x264 codec not available, switching to mp4v") 165 | if args.output: 166 | if os.path.isdir(args.output): 167 | output_fname = os.path.join(args.output, basename) 168 | output_fname = os.path.splitext(output_fname)[0] + file_ext 169 | else: 170 | output_fname = args.output 171 | assert not os.path.isfile(output_fname), output_fname 172 | output_file = cv2.VideoWriter( 173 | filename=output_fname, 174 | # some installation of opencv may not support x264 (due to its license), 175 | # you can try other format (e.g. MPEG) 176 | fourcc=cv2.VideoWriter_fourcc(*codec), 177 | fps=float(frames_per_second), 178 | frameSize=(width, height), 179 | isColor=True, 180 | ) 181 | assert os.path.isfile(args.video_input) 182 | for vis_frame in tqdm.tqdm(demo.run_on_video(video), total=num_frames): 183 | if args.output: 184 | output_file.write(vis_frame) 185 | else: 186 | cv2.namedWindow(basename, cv2.WINDOW_NORMAL) 187 | cv2.imshow(basename, vis_frame) 188 | if cv2.waitKey(1) == 27: 189 | break # esc to quit 190 | video.release() 191 | if args.output: 192 | output_file.release() 193 | else: 194 | cv2.destroyAllWindows() 195 | -------------------------------------------------------------------------------- /demo/predictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copied from: https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py 3 | import atexit 4 | import bisect 5 | import multiprocessing as mp 6 | from collections import deque 7 | 8 | import cv2 9 | import torch 10 | 11 | from detectron2.data import MetadataCatalog 12 | from detectron2.engine.defaults import DefaultPredictor 13 | from detectron2.utils.video_visualizer import VideoVisualizer 14 | from detectron2.utils.visualizer import ColorMode, Visualizer 15 | 16 | 17 | class VisualizationDemo(object): 18 | def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False): 19 | """ 20 | Args: 21 | cfg (CfgNode): 22 | instance_mode (ColorMode): 23 | parallel (bool): whether to run the model in different processes from visualization. 24 | Useful since the visualization logic can be slow. 25 | """ 26 | self.metadata = MetadataCatalog.get( 27 | cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused" 28 | ) 29 | self.cpu_device = torch.device("cpu") 30 | self.instance_mode = instance_mode 31 | 32 | self.parallel = parallel 33 | if parallel: 34 | num_gpu = torch.cuda.device_count() 35 | self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu) 36 | else: 37 | self.predictor = DefaultPredictor(cfg) 38 | 39 | def run_on_image(self, image): 40 | """ 41 | Args: 42 | image (np.ndarray): an image of shape (H, W, C) (in BGR order). 
43 | This is the format used by OpenCV. 44 | Returns: 45 | predictions (dict): the output of the model. 46 | vis_output (VisImage): the visualized image output. 47 | """ 48 | vis_output = None 49 | predictions = self.predictor(image) 50 | # Convert image from OpenCV BGR format to Matplotlib RGB format. 51 | image = image[:, :, ::-1] 52 | visualizer = Visualizer(image, self.metadata, instance_mode=self.instance_mode) 53 | if "panoptic_seg" in predictions: 54 | panoptic_seg, segments_info = predictions["panoptic_seg"] 55 | vis_output = visualizer.draw_panoptic_seg_predictions( 56 | panoptic_seg.to(self.cpu_device), segments_info 57 | ) 58 | else: 59 | if "sem_seg" in predictions: 60 | vis_output = visualizer.draw_sem_seg( 61 | predictions["sem_seg"].argmax(dim=0).to(self.cpu_device) 62 | ) 63 | if "instances" in predictions: 64 | instances = predictions["instances"].to(self.cpu_device) 65 | vis_output = visualizer.draw_instance_predictions(predictions=instances) 66 | 67 | return predictions, vis_output 68 | 69 | def _frame_from_video(self, video): 70 | while video.isOpened(): 71 | success, frame = video.read() 72 | if success: 73 | yield frame 74 | else: 75 | break 76 | 77 | def run_on_video(self, video): 78 | """ 79 | Visualizes predictions on frames of the input video. 80 | Args: 81 | video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be 82 | either a webcam or a video file. 83 | Yields: 84 | ndarray: BGR visualizations of each video frame. 85 | """ 86 | video_visualizer = VideoVisualizer(self.metadata, self.instance_mode) 87 | 88 | def process_predictions(frame, predictions): 89 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 90 | if "panoptic_seg" in predictions: 91 | panoptic_seg, segments_info = predictions["panoptic_seg"] 92 | vis_frame = video_visualizer.draw_panoptic_seg_predictions( 93 | frame, panoptic_seg.to(self.cpu_device), segments_info 94 | ) 95 | elif "instances" in predictions: 96 | predictions = predictions["instances"].to(self.cpu_device) 97 | vis_frame = video_visualizer.draw_instance_predictions(frame, predictions) 98 | elif "sem_seg" in predictions: 99 | vis_frame = video_visualizer.draw_sem_seg( 100 | frame, predictions["sem_seg"].argmax(dim=0).to(self.cpu_device) 101 | ) 102 | 103 | # Converts Matplotlib RGB format to OpenCV BGR format 104 | vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR) 105 | return vis_frame 106 | 107 | frame_gen = self._frame_from_video(video) 108 | if self.parallel: 109 | buffer_size = self.predictor.default_buffer_size 110 | 111 | frame_data = deque() 112 | 113 | for cnt, frame in enumerate(frame_gen): 114 | frame_data.append(frame) 115 | self.predictor.put(frame) 116 | 117 | if cnt >= buffer_size: 118 | frame = frame_data.popleft() 119 | predictions = self.predictor.get() 120 | yield process_predictions(frame, predictions) 121 | 122 | while len(frame_data): 123 | frame = frame_data.popleft() 124 | predictions = self.predictor.get() 125 | yield process_predictions(frame, predictions) 126 | else: 127 | for frame in frame_gen: 128 | yield process_predictions(frame, self.predictor(frame)) 129 | 130 | 131 | class AsyncPredictor: 132 | """ 133 | A predictor that runs the model asynchronously, possibly on >1 GPUs. 134 | Because rendering the visualization takes considerably amount of time, 135 | this helps improve throughput a little bit when rendering videos. 
136 | """ 137 | 138 | class _StopToken: 139 | pass 140 | 141 | class _PredictWorker(mp.Process): 142 | def __init__(self, cfg, task_queue, result_queue): 143 | self.cfg = cfg 144 | self.task_queue = task_queue 145 | self.result_queue = result_queue 146 | super().__init__() 147 | 148 | def run(self): 149 | predictor = DefaultPredictor(self.cfg) 150 | 151 | while True: 152 | task = self.task_queue.get() 153 | if isinstance(task, AsyncPredictor._StopToken): 154 | break 155 | idx, data = task 156 | result = predictor(data) 157 | self.result_queue.put((idx, result)) 158 | 159 | def __init__(self, cfg, num_gpus: int = 1): 160 | """ 161 | Args: 162 | cfg (CfgNode): 163 | num_gpus (int): if 0, will run on CPU 164 | """ 165 | num_workers = max(num_gpus, 1) 166 | self.task_queue = mp.Queue(maxsize=num_workers * 3) 167 | self.result_queue = mp.Queue(maxsize=num_workers * 3) 168 | self.procs = [] 169 | for gpuid in range(max(num_gpus, 1)): 170 | cfg = cfg.clone() 171 | cfg.defrost() 172 | cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu" 173 | self.procs.append( 174 | AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue) 175 | ) 176 | 177 | self.put_idx = 0 178 | self.get_idx = 0 179 | self.result_rank = [] 180 | self.result_data = [] 181 | 182 | for p in self.procs: 183 | p.start() 184 | atexit.register(self.shutdown) 185 | 186 | def put(self, image): 187 | self.put_idx += 1 188 | self.task_queue.put((self.put_idx, image)) 189 | 190 | def get(self): 191 | self.get_idx += 1 # the index needed for this request 192 | if len(self.result_rank) and self.result_rank[0] == self.get_idx: 193 | res = self.result_data[0] 194 | del self.result_data[0], self.result_rank[0] 195 | return res 196 | 197 | while True: 198 | # make sure the results are returned in the correct order 199 | idx, res = self.result_queue.get() 200 | if idx == self.get_idx: 201 | return res 202 | insert = bisect.bisect(self.result_rank, idx) 203 | self.result_rank.insert(insert, idx) 204 | self.result_data.insert(insert, res) 205 | 206 | def __len__(self): 207 | return self.put_idx - self.get_idx 208 | 209 | def __call__(self, image): 210 | self.put(image) 211 | return self.get() 212 | 213 | def shutdown(self): 214 | for _ in self.procs: 215 | self.task_queue.put(AsyncPredictor._StopToken()) 216 | 217 | @property 218 | def default_buffer_size(self): 219 | return len(self.procs) * 5 220 | -------------------------------------------------------------------------------- /figures/dinosaur.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-Research/MaskDINO/3831d8514a3728535ace8d4ecc7d28044c42dd14/figures/dinosaur.png -------------------------------------------------------------------------------- /figures/framework.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-Research/MaskDINO/3831d8514a3728535ace8d4ecc7d28044c42dd14/figures/framework.jpg -------------------------------------------------------------------------------- /figures/instance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-Research/MaskDINO/3831d8514a3728535ace8d4ecc7d28044c42dd14/figures/instance.png -------------------------------------------------------------------------------- /figures/panoptic.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IDEA-Research/MaskDINO/3831d8514a3728535ace8d4ecc7d28044c42dd14/figures/panoptic.png -------------------------------------------------------------------------------- /figures/semantic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-Research/MaskDINO/3831d8514a3728535ace8d4ecc7d28044c42dd14/figures/semantic.png -------------------------------------------------------------------------------- /figures/sota.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-Research/MaskDINO/3831d8514a3728535ace8d4ecc7d28044c42dd14/figures/sota.png -------------------------------------------------------------------------------- /maskdino/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-Research/MaskDINO/3831d8514a3728535ace8d4ecc7d28044c42dd14/maskdino/.DS_Store -------------------------------------------------------------------------------- /maskdino/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 IDEA. All Rights Reserved. 3 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 4 | # ------------------------------------------------------------------------ 5 | # Modified from Mask2Former https://github.com/facebookresearch/Mask2Former by Feng Li and Hao Zhang. 6 | # ------------------------------------------------------------------------------ 7 | from . import data # register all new datasets 8 | from . import modeling 9 | 10 | # config 11 | from .config import add_maskdino_config 12 | 13 | # dataset loading 14 | from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper 15 | from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper 16 | from .data.dataset_mappers.detr_dataset_mapper import DetrDatasetMapper 17 | 18 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 19 | MaskFormerSemanticDatasetMapper, 20 | ) 21 | 22 | # models 23 | from .maskdino import MaskDINO 24 | # from .data.datasets_detr import coco 25 | from .test_time_augmentation import SemanticSegmentorWithTTA 26 | 27 | # evaluation 28 | from .evaluation.instance_evaluation import InstanceSegEvaluator 29 | # util 30 | from .utils import box_ops, misc, utils 31 | -------------------------------------------------------------------------------- /maskdino/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # ------------------------------------------------------------------------ 3 | # Copyright (c) 2022 IDEA. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | from detectron2.config import CfgNode as CN 7 | 8 | 9 | def add_maskdino_config(cfg): 10 | """ 11 | Add config for MaskDINO. 
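    It registers every MaskDINO-specific key (input/augmentation options, solver extras, loss and
    matcher cost weights, transformer encoder/decoder settings, denoising, inference thresholds,
    and the Swin backbone block) on top of a base detectron2 config.

    Typical usage, mirroring setup_cfg() in demo/demo.py::

        cfg = get_cfg()
        add_deeplab_config(cfg)
        add_maskdino_config(cfg)
        cfg.merge_from_file("configs/coco/instance-segmentation/maskdino_R50_bs16_50ep_3s.yaml")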
12 | """ 13 | # NOTE: configs from original mask2former 14 | # data config 15 | # select the dataset mapper 16 | cfg.INPUT.DATASET_MAPPER_NAME = "MaskDINO_semantic" 17 | # Color augmentation 18 | cfg.INPUT.COLOR_AUG_SSD = False 19 | # We retry random cropping until no single category in semantic segmentation GT occupies more 20 | # than `SINGLE_CATEGORY_MAX_AREA` part of the crop. 21 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0 22 | # Pad image and segmentation GT in dataset mapper. 23 | cfg.INPUT.SIZE_DIVISIBILITY = -1 24 | 25 | # solver config 26 | # weight decay on embedding 27 | cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0 28 | # optimizer 29 | cfg.SOLVER.OPTIMIZER = "ADAMW" 30 | cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1 31 | 32 | # MaskDINO model config 33 | cfg.MODEL.MaskDINO = CN() 34 | cfg.MODEL.MaskDINO.LEARN_TGT = False 35 | 36 | # loss 37 | cfg.MODEL.MaskDINO.PANO_BOX_LOSS = False 38 | cfg.MODEL.MaskDINO.SEMANTIC_CE_LOSS = False 39 | cfg.MODEL.MaskDINO.DEEP_SUPERVISION = True 40 | cfg.MODEL.MaskDINO.NO_OBJECT_WEIGHT = 0.1 41 | cfg.MODEL.MaskDINO.CLASS_WEIGHT = 4.0 42 | cfg.MODEL.MaskDINO.DICE_WEIGHT = 5.0 43 | cfg.MODEL.MaskDINO.MASK_WEIGHT = 5.0 44 | cfg.MODEL.MaskDINO.BOX_WEIGHT = 5. 45 | cfg.MODEL.MaskDINO.GIOU_WEIGHT = 2. 46 | 47 | # cost weight 48 | cfg.MODEL.MaskDINO.COST_CLASS_WEIGHT = 4.0 49 | cfg.MODEL.MaskDINO.COST_DICE_WEIGHT = 5.0 50 | cfg.MODEL.MaskDINO.COST_MASK_WEIGHT = 5.0 51 | cfg.MODEL.MaskDINO.COST_BOX_WEIGHT = 5. 52 | cfg.MODEL.MaskDINO.COST_GIOU_WEIGHT = 2. 53 | 54 | # transformer config 55 | cfg.MODEL.MaskDINO.NHEADS = 8 56 | cfg.MODEL.MaskDINO.DROPOUT = 0.1 57 | cfg.MODEL.MaskDINO.DIM_FEEDFORWARD = 2048 58 | cfg.MODEL.MaskDINO.ENC_LAYERS = 0 59 | cfg.MODEL.MaskDINO.DEC_LAYERS = 6 60 | cfg.MODEL.MaskDINO.INITIAL_PRED = True 61 | cfg.MODEL.MaskDINO.PRE_NORM = False 62 | cfg.MODEL.MaskDINO.BOX_LOSS = True 63 | cfg.MODEL.MaskDINO.HIDDEN_DIM = 256 64 | cfg.MODEL.MaskDINO.NUM_OBJECT_QUERIES = 100 65 | 66 | cfg.MODEL.MaskDINO.ENFORCE_INPUT_PROJ = False 67 | cfg.MODEL.MaskDINO.TWO_STAGE = True 68 | cfg.MODEL.MaskDINO.INITIALIZE_BOX_TYPE = 'no' # ['no', 'bitmask', 'mask2box'] 69 | cfg.MODEL.MaskDINO.DN="seg" 70 | cfg.MODEL.MaskDINO.DN_NOISE_SCALE=0.4 71 | cfg.MODEL.MaskDINO.DN_NUM=100 72 | cfg.MODEL.MaskDINO.PRED_CONV=False 73 | 74 | cfg.MODEL.MaskDINO.EVAL_FLAG = 1 75 | 76 | # MSDeformAttn encoder configs 77 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"] 78 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4 79 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8 80 | cfg.MODEL.SEM_SEG_HEAD.DIM_FEEDFORWARD = 1024 81 | cfg.MODEL.SEM_SEG_HEAD.NUM_FEATURE_LEVELS = 3 82 | cfg.MODEL.SEM_SEG_HEAD.TOTAL_NUM_FEATURE_LEVELS = 4 83 | cfg.MODEL.SEM_SEG_HEAD.FEATURE_ORDER = 'high2low' # ['low2high', 'high2low'] high2low: from high level to low level 84 | 85 | ##################### 86 | 87 | # MaskDINO inference config 88 | cfg.MODEL.MaskDINO.TEST = CN() 89 | cfg.MODEL.MaskDINO.TEST.TEST_FOUCUS_ON_BOX = False 90 | cfg.MODEL.MaskDINO.TEST.SEMANTIC_ON = True 91 | cfg.MODEL.MaskDINO.TEST.INSTANCE_ON = False 92 | cfg.MODEL.MaskDINO.TEST.PANOPTIC_ON = False 93 | cfg.MODEL.MaskDINO.TEST.OBJECT_MASK_THRESHOLD = 0.0 94 | cfg.MODEL.MaskDINO.TEST.OVERLAP_THRESHOLD = 0.0 95 | cfg.MODEL.MaskDINO.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False 96 | cfg.MODEL.MaskDINO.TEST.PANO_TRANSFORM_EVAL = True 97 | cfg.MODEL.MaskDINO.TEST.PANO_TEMPERATURE = 0.06 98 | # cfg.MODEL.MaskDINO.TEST.EVAL_FLAG = 1 99 | 100 | # Sometimes 
`backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet) 101 | # you can use this config to override 102 | cfg.MODEL.MaskDINO.SIZE_DIVISIBILITY = 32 103 | 104 | # pixel decoder config 105 | cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256 106 | # adding transformer in pixel decoder 107 | cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0 108 | # pixel decoder 109 | cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "MaskDINOEncoder" 110 | 111 | # transformer module 112 | cfg.MODEL.MaskDINO.TRANSFORMER_DECODER_NAME = "MaskDINODecoder" 113 | 114 | # LSJ aug 115 | cfg.INPUT.IMAGE_SIZE = 1024 116 | cfg.INPUT.MIN_SCALE = 0.1 117 | cfg.INPUT.MAX_SCALE = 2.0 118 | 119 | # point loss configs 120 | # Number of points sampled during training for a mask point head. 121 | cfg.MODEL.MaskDINO.TRAIN_NUM_POINTS = 112 * 112 122 | # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the 123 | # original paper. 124 | cfg.MODEL.MaskDINO.OVERSAMPLE_RATIO = 3.0 125 | # Importance sampling parameter for PointRend point sampling during training. Parametr `beta` in 126 | # the original paper. 127 | cfg.MODEL.MaskDINO.IMPORTANCE_SAMPLE_RATIO = 0.75 128 | 129 | # swin transformer backbone 130 | cfg.MODEL.SWIN = CN() 131 | cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224 132 | cfg.MODEL.SWIN.PATCH_SIZE = 4 133 | cfg.MODEL.SWIN.EMBED_DIM = 96 134 | cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2] 135 | cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24] 136 | cfg.MODEL.SWIN.WINDOW_SIZE = 7 137 | cfg.MODEL.SWIN.MLP_RATIO = 4.0 138 | cfg.MODEL.SWIN.QKV_BIAS = True 139 | cfg.MODEL.SWIN.QK_SCALE = None 140 | cfg.MODEL.SWIN.DROP_RATE = 0.0 141 | cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0 142 | cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3 143 | cfg.MODEL.SWIN.APE = False 144 | cfg.MODEL.SWIN.PATCH_NORM = True 145 | cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"] 146 | cfg.MODEL.SWIN.USE_CHECKPOINT = False 147 | 148 | cfg.Default_loading=True # a bug in my d2. resume use this; if first time ResNet load, set it false 149 | -------------------------------------------------------------------------------- /maskdino/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import datasets 2 | 3 | -------------------------------------------------------------------------------- /maskdino/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-Research/MaskDINO/3831d8514a3728535ace8d4ecc7d28044c42dd14/maskdino/data/dataset_mappers/__init__.py -------------------------------------------------------------------------------- /maskdino/data/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 IDEA. All Rights Reserved. 3 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 4 | # ------------------------------------------------------------------------ 5 | # Modified from Mask2Former https://github.com/facebookresearch/Mask2Former by Feng Li. 
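# Overview: this mapper implements the LSJ-style (large scale jitter) COCO instance pipeline:
# optional RandomFlip, ResizeScale within [cfg.INPUT.MIN_SCALE, cfg.INPUT.MAX_SCALE], and a
# FixedSizeCrop to cfg.INPUT.IMAGE_SIZE; polygon annotations are rasterized to bitmasks and
# bounding boxes are taken from the transformed masks (see __call__ below).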
6 | import copy 7 | import logging 8 | 9 | import numpy as np 10 | import torch 11 | 12 | from detectron2.config import configurable 13 | from detectron2.data import detection_utils as utils 14 | from detectron2.data import transforms as T 15 | from detectron2.data.transforms import TransformGen 16 | from detectron2.structures import BitMasks, Instances, PolygonMasks 17 | 18 | from pycocotools import mask as coco_mask 19 | 20 | __all__ = ["COCOInstanceNewBaselineDatasetMapper"] 21 | 22 | 23 | def convert_coco_poly_to_mask(segmentations, height, width): 24 | masks = [] 25 | for polygons in segmentations: 26 | rles = coco_mask.frPyObjects(polygons, height, width) 27 | mask = coco_mask.decode(rles) 28 | if len(mask.shape) < 3: 29 | mask = mask[..., None] 30 | mask = torch.as_tensor(mask, dtype=torch.uint8) 31 | mask = mask.any(dim=2) 32 | masks.append(mask) 33 | if masks: 34 | masks = torch.stack(masks, dim=0) 35 | else: 36 | masks = torch.zeros((0, height, width), dtype=torch.uint8) 37 | return masks 38 | 39 | 40 | def build_transform_gen(cfg, is_train): 41 | """ 42 | Create a list of default :class:`Augmentation` from config. 43 | Now it includes resizing and flipping. 44 | Returns: 45 | list[Augmentation] 46 | """ 47 | assert is_train, "Only support training augmentation" 48 | image_size = cfg.INPUT.IMAGE_SIZE 49 | min_scale = cfg.INPUT.MIN_SCALE 50 | max_scale = cfg.INPUT.MAX_SCALE 51 | 52 | augmentation = [] 53 | 54 | if cfg.INPUT.RANDOM_FLIP != "none": 55 | augmentation.append( 56 | T.RandomFlip( 57 | horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal", 58 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical", 59 | ) 60 | ) 61 | 62 | augmentation.extend([ 63 | T.ResizeScale( 64 | min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size 65 | ), 66 | T.FixedSizeCrop(crop_size=(image_size, image_size)), 67 | ]) 68 | 69 | return augmentation 70 | 71 | 72 | class COCOInstanceNewBaselineDatasetMapper: 73 | """ 74 | A callable which takes a dataset dict in Detectron2 Dataset format, 75 | and map it into a format used by MaskFormer. 76 | 77 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. 78 | 79 | The callable currently does the following: 80 | 81 | 1. Read the image from "file_name" 82 | 2. Applies geometric transforms to the image and annotation 83 | 3. Find and applies suitable cropping to the image and annotation 84 | 4. Prepare image and annotation to Tensors 85 | """ 86 | 87 | @configurable 88 | def __init__( 89 | self, 90 | is_train=True, 91 | *, 92 | tfm_gens, 93 | image_format, 94 | ): 95 | """ 96 | NOTE: this interface is experimental. 97 | Args: 98 | is_train: for training or inference 99 | augmentations: a list of augmentations or deterministic transforms to apply 100 | tfm_gens: data augmentation 101 | image_format: an image format supported by :func:`detection_utils.read_image`. 
102 | """ 103 | self.tfm_gens = tfm_gens 104 | logging.getLogger(__name__).info( 105 | "[COCOInstanceNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(str(self.tfm_gens)) 106 | ) 107 | 108 | self.img_format = image_format 109 | self.is_train = is_train 110 | 111 | @classmethod 112 | def from_config(cls, cfg, is_train=True): 113 | # Build augmentation 114 | tfm_gens = build_transform_gen(cfg, is_train) 115 | 116 | ret = { 117 | "is_train": is_train, 118 | "tfm_gens": tfm_gens, 119 | "image_format": cfg.INPUT.FORMAT, 120 | } 121 | return ret 122 | 123 | def __call__(self, dataset_dict): 124 | """ 125 | Args: 126 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 127 | 128 | Returns: 129 | dict: a format that builtin models in detectron2 accept 130 | """ 131 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 132 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 133 | utils.check_image_size(dataset_dict, image) 134 | 135 | # TODO: get padding mask 136 | # by feeding a "segmentation mask" to the same transforms 137 | padding_mask = np.ones(image.shape[:2]) 138 | 139 | image, transforms = T.apply_transform_gens(self.tfm_gens, image) 140 | # the crop transformation has default padding value 0 for segmentation 141 | padding_mask = transforms.apply_segmentation(padding_mask) 142 | padding_mask = ~ padding_mask.astype(bool) 143 | 144 | image_shape = image.shape[:2] # h, w 145 | 146 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 147 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 148 | # Therefore it's important to use torch.Tensor. 149 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 150 | dataset_dict["padding_mask"] = torch.as_tensor(np.ascontiguousarray(padding_mask)) 151 | 152 | if not self.is_train: 153 | # USER: Modify this if you want to keep them for some reason. 154 | dataset_dict.pop("annotations", None) 155 | return dataset_dict 156 | 157 | if "annotations" in dataset_dict: 158 | # USER: Modify this if you want to keep them for some reason. 159 | for anno in dataset_dict["annotations"]: 160 | # Let's always keep mask 161 | anno.pop("keypoints", None) 162 | 163 | # USER: Implement additional transformations if you have other types of data 164 | annos = [ 165 | utils.transform_instance_annotations(obj, transforms, image_shape) 166 | for obj in dataset_dict.pop("annotations") 167 | if obj.get("iscrowd", 0) == 0 168 | ] 169 | # NOTE: does not support BitMask due to augmentation 170 | # Current BitMask cannot handle empty objects 171 | instances = utils.annotations_to_instances(annos, image_shape) 172 | # After transforms such as cropping are applied, the bounding box may no longer 173 | # tightly bound the object. As an example, imagine a triangle object 174 | # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight 175 | # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to 176 | # the intersection of original bounding box and the cropping box. 
177 | if not instances.has('gt_masks'): # this is to avoid empty annotation 178 | instances.gt_masks = PolygonMasks([]) 179 | instances.gt_boxes = instances.gt_masks.get_bounding_boxes() 180 | # Need to filter empty instances first (due to augmentation) 181 | instances = utils.filter_empty_instances(instances) 182 | # Generate masks from polygon 183 | h, w = instances.image_size 184 | if hasattr(instances, 'gt_masks'): 185 | gt_masks = instances.gt_masks 186 | gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w) 187 | instances.gt_masks = gt_masks 188 | 189 | dataset_dict["instances"] = instances 190 | 191 | return dataset_dict 192 | -------------------------------------------------------------------------------- /maskdino/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 IDEA. All Rights Reserved. 3 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 4 | # ------------------------------------------------------------------------ 5 | # Modified from Mask2Former https://github.com/facebookresearch/Mask2Former by Feng Li. 6 | import copy 7 | import logging 8 | 9 | import numpy as np 10 | import torch 11 | 12 | from detectron2.config import configurable 13 | from detectron2.data import detection_utils as utils 14 | from detectron2.data import transforms as T 15 | from detectron2.data.transforms import TransformGen 16 | from detectron2.structures import BitMasks, Boxes, Instances 17 | 18 | __all__ = ["COCOPanopticNewBaselineDatasetMapper"] 19 | 20 | 21 | def build_transform_gen(cfg, is_train): 22 | """ 23 | Create a list of default :class:`Augmentation` from config. 24 | Now it includes resizing and flipping. 25 | Returns: 26 | list[Augmentation] 27 | """ 28 | assert is_train, "Only support training augmentation" 29 | image_size = cfg.INPUT.IMAGE_SIZE 30 | min_scale = cfg.INPUT.MIN_SCALE 31 | max_scale = cfg.INPUT.MAX_SCALE 32 | 33 | augmentation = [] 34 | 35 | if cfg.INPUT.RANDOM_FLIP != "none": 36 | augmentation.append( 37 | T.RandomFlip( 38 | horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal", 39 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical", 40 | ) 41 | ) 42 | 43 | augmentation.extend([ 44 | T.ResizeScale( 45 | min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size 46 | ), 47 | T.FixedSizeCrop(crop_size=(image_size, image_size)), 48 | ]) 49 | 50 | return augmentation 51 | 52 | 53 | # This is specifically designed for the COCO dataset. 54 | class COCOPanopticNewBaselineDatasetMapper: 55 | """ 56 | A callable which takes a dataset dict in Detectron2 Dataset format, 57 | and map it into a format used by MaskFormer. 58 | 59 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. 60 | 61 | The callable currently does the following: 62 | 63 | 1. Read the image from "file_name" 64 | 2. Applies geometric transforms to the image and annotation 65 | 3. Find and applies suitable cropping to the image and annotation 66 | 4. Prepare image and annotation to Tensors 67 | """ 68 | 69 | @configurable 70 | def __init__( 71 | self, 72 | is_train=True, 73 | *, 74 | tfm_gens, 75 | image_format, 76 | ): 77 | """ 78 | NOTE: this interface is experimental. 
79 | Args: 80 | is_train: for training or inference 81 | augmentations: a list of augmentations or deterministic transforms to apply 82 | crop_gen: crop augmentation 83 | tfm_gens: data augmentation 84 | image_format: an image format supported by :func:`detection_utils.read_image`. 85 | """ 86 | self.tfm_gens = tfm_gens 87 | logging.getLogger(__name__).info( 88 | "[COCOPanopticNewBaselineDatasetMapper] Full TransformGens used in training: {}".format( 89 | str(self.tfm_gens) 90 | ) 91 | ) 92 | 93 | self.img_format = image_format 94 | self.is_train = is_train 95 | 96 | @classmethod 97 | def from_config(cls, cfg, is_train=True): 98 | # Build augmentation 99 | tfm_gens = build_transform_gen(cfg, is_train) 100 | 101 | ret = { 102 | "is_train": is_train, 103 | "tfm_gens": tfm_gens, 104 | "image_format": cfg.INPUT.FORMAT, 105 | } 106 | return ret 107 | 108 | def __call__(self, dataset_dict): 109 | """ 110 | Args: 111 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 112 | 113 | Returns: 114 | dict: a format that builtin models in detectron2 accept 115 | """ 116 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 117 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 118 | utils.check_image_size(dataset_dict, image) 119 | 120 | image, transforms = T.apply_transform_gens(self.tfm_gens, image) 121 | image_shape = image.shape[:2] # h, w 122 | 123 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 124 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 125 | # Therefore it's important to use torch.Tensor. 126 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 127 | 128 | if not self.is_train: 129 | # USER: Modify this if you want to keep them for some reason. 130 | dataset_dict.pop("annotations", None) 131 | return dataset_dict 132 | 133 | if "pan_seg_file_name" in dataset_dict: 134 | pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") 135 | segments_info = dataset_dict["segments_info"] 136 | 137 | # apply the same transformation to panoptic segmentation 138 | pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) 139 | 140 | from panopticapi.utils import rgb2id 141 | 142 | pan_seg_gt = rgb2id(pan_seg_gt) 143 | 144 | instances = Instances(image_shape) 145 | classes = [] 146 | masks = [] 147 | for segment_info in segments_info: 148 | class_id = segment_info["category_id"] 149 | if not segment_info["iscrowd"]: 150 | classes.append(class_id) 151 | masks.append(pan_seg_gt == segment_info["id"]) 152 | 153 | classes = np.array(classes) 154 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64) 155 | if len(masks) == 0: 156 | # Some image does not have annotation (all ignored) 157 | instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) 158 | instances.gt_boxes = Boxes(torch.zeros((0, 4))) 159 | else: 160 | masks = BitMasks( 161 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) 162 | ) 163 | instances.gt_masks = masks.tensor 164 | instances.gt_boxes = masks.get_bounding_boxes() 165 | 166 | dataset_dict["instances"] = instances 167 | 168 | return dataset_dict 169 | -------------------------------------------------------------------------------- /maskdino/data/dataset_mappers/detr_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | import copy 3 | import logging 4 | 5 | import numpy as np 6 | import torch 7 | 8 | from detectron2.data import detection_utils as utils 9 | from detectron2.data import transforms as T 10 | from detectron2.data.transforms import TransformGen 11 | from pycocotools import mask as coco_mask 12 | __all__ = ["DetrDatasetMapper"] 13 | def convert_coco_poly_to_mask(segmentations, height, width): 14 | masks = [] 15 | for polygons in segmentations: 16 | rles = coco_mask.frPyObjects(polygons, height, width) 17 | mask = coco_mask.decode(rles) 18 | if len(mask.shape) < 3: 19 | mask = mask[..., None] 20 | mask = torch.as_tensor(mask, dtype=torch.uint8) 21 | mask = mask.any(dim=2) 22 | masks.append(mask) 23 | if masks: 24 | masks = torch.stack(masks, dim=0) 25 | else: 26 | masks = torch.zeros((0, height, width), dtype=torch.uint8) 27 | return masks 28 | 29 | def build_transform_gen(cfg, is_train): 30 | """ 31 | Create a list of :class:`TransformGen` from config. 32 | Returns: 33 | list[TransformGen] 34 | """ 35 | if is_train: 36 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 37 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 38 | sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING 39 | else: 40 | min_size = cfg.INPUT.MIN_SIZE_TEST 41 | max_size = cfg.INPUT.MAX_SIZE_TEST 42 | sample_style = "choice" 43 | if sample_style == "range": 44 | assert len(min_size) == 2, "more than 2 ({}) min_size(s) are provided for ranges".format(len(min_size)) 45 | 46 | logger = logging.getLogger(__name__) 47 | 48 | tfm_gens = [] 49 | if is_train: 50 | tfm_gens.append(T.RandomFlip()) 51 | tfm_gens.append(T.ResizeShortestEdge(min_size, max_size, sample_style)) 52 | if is_train: 53 | logger.info("TransformGens used in training: " + str(tfm_gens)) 54 | return tfm_gens 55 | 56 | 57 | class DetrDatasetMapper: 58 | """ 59 | A callable which takes a dataset dict in Detectron2 Dataset format, 60 | and map it into a format used by DETR. 61 | 62 | The callable currently does the following: 63 | 64 | 1. Read the image from "file_name" 65 | 2. Applies geometric transforms to the image and annotation 66 | 3. Find and applies suitable cropping to the image and annotation 67 | 4. Prepare image and annotation to Tensors 68 | """ 69 | 70 | def __init__(self, cfg, is_train=True): 71 | if cfg.INPUT.CROP.ENABLED and is_train: 72 | self.crop_gen = [ 73 | T.ResizeShortestEdge([400, 500, 600], sample_style="choice"), 74 | T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE), 75 | ] 76 | else: 77 | self.crop_gen = None 78 | 79 | self.mask_on = True 80 | self.tfm_gens = build_transform_gen(cfg, is_train) 81 | logging.getLogger(__name__).info( 82 | "Full TransformGens used in training: {}, crop: {}".format(str(self.tfm_gens), str(self.crop_gen)) 83 | ) 84 | 85 | self.img_format = cfg.INPUT.FORMAT 86 | self.is_train = is_train 87 | 88 | def __call__(self, dataset_dict): 89 | """ 90 | Args: 91 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 
92 | 93 | Returns: 94 | dict: a format that builtin models in detectron2 accept 95 | """ 96 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 97 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 98 | utils.check_image_size(dataset_dict, image) 99 | 100 | if self.crop_gen is None: 101 | image, transforms = T.apply_transform_gens(self.tfm_gens, image) 102 | else: 103 | if np.random.rand() > 0.5: 104 | image, transforms = T.apply_transform_gens(self.tfm_gens, image) 105 | else: 106 | image, transforms = T.apply_transform_gens( 107 | self.tfm_gens[:-1] + self.crop_gen + self.tfm_gens[-1:], image 108 | ) 109 | 110 | image_shape = image.shape[:2] # h, w 111 | 112 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 113 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 114 | # Therefore it's important to use torch.Tensor. 115 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 116 | 117 | if not self.is_train: 118 | # USER: Modify this if you want to keep them for some reason. 119 | dataset_dict.pop("annotations", None) 120 | return dataset_dict 121 | 122 | if "annotations" in dataset_dict: 123 | # USER: Modify this if you want to keep them for some reason. 124 | for anno in dataset_dict["annotations"]: 125 | if not self.mask_on: 126 | anno.pop("segmentation", None) 127 | anno.pop("keypoints", None) 128 | 129 | # USER: Implement additional transformations if you have other types of data 130 | annos = [ 131 | utils.transform_instance_annotations(obj, transforms, image_shape) 132 | for obj in dataset_dict.pop("annotations") 133 | if obj.get("iscrowd", 0) == 0 134 | ] 135 | instances = utils.annotations_to_instances(annos, image_shape) 136 | instances = utils.filter_empty_instances(instances) 137 | h, w = instances.image_size 138 | if hasattr(instances, 'gt_masks'): 139 | gt_masks = instances.gt_masks 140 | gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w) 141 | instances.gt_masks = gt_masks 142 | 143 | dataset_dict["instances"] = instances 144 | return dataset_dict 145 | -------------------------------------------------------------------------------- /maskdino/data/dataset_mappers/mask_former_semantic_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | import logging 4 | 5 | import numpy as np 6 | import torch 7 | from torch.nn import functional as F 8 | 9 | from detectron2.config import configurable 10 | from detectron2.data import MetadataCatalog 11 | from detectron2.data import detection_utils as utils 12 | from detectron2.data import transforms as T 13 | from detectron2.projects.point_rend import ColorAugSSDTransform 14 | from detectron2.structures import BitMasks, Boxes, Instances 15 | 16 | __all__ = ["MaskFormerSemanticDatasetMapper"] 17 | 18 | 19 | class MaskFormerSemanticDatasetMapper: 20 | """ 21 | A callable which takes a dataset dict in Detectron2 Dataset format, 22 | and map it into a format used by MaskFormer for semantic segmentation. 23 | 24 | The callable currently does the following: 25 | 26 | 1. Read the image from "file_name" 27 | 2. Applies geometric transforms to the image and annotation 28 | 3. Find and applies suitable cropping to the image and annotation 29 | 4. 
Prepare image and annotation to Tensors 30 | """ 31 | 32 | @configurable 33 | def __init__( 34 | self, 35 | is_train=True, 36 | *, 37 | augmentations, 38 | image_format, 39 | ignore_label, 40 | size_divisibility, 41 | ): 42 | """ 43 | NOTE: this interface is experimental. 44 | Args: 45 | is_train: for training or inference 46 | augmentations: a list of augmentations or deterministic transforms to apply 47 | image_format: an image format supported by :func:`detection_utils.read_image`. 48 | ignore_label: the label that is ignored to evaluation 49 | size_divisibility: pad image size to be divisible by this value 50 | """ 51 | self.is_train = is_train 52 | self.tfm_gens = augmentations 53 | self.img_format = image_format 54 | self.ignore_label = ignore_label 55 | self.size_divisibility = size_divisibility 56 | 57 | logger = logging.getLogger(__name__) 58 | mode = "training" if is_train else "inference" 59 | logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}") 60 | 61 | @classmethod 62 | def from_config(cls, cfg, is_train=True): 63 | # Build augmentation 64 | augs = [ 65 | T.ResizeShortestEdge( 66 | cfg.INPUT.MIN_SIZE_TRAIN, 67 | cfg.INPUT.MAX_SIZE_TRAIN, 68 | cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, 69 | ) 70 | ] 71 | if cfg.INPUT.CROP.ENABLED: 72 | augs.append( 73 | T.RandomCrop_CategoryAreaConstraint( 74 | cfg.INPUT.CROP.TYPE, 75 | cfg.INPUT.CROP.SIZE, 76 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA, 77 | cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 78 | ) 79 | ) 80 | if cfg.INPUT.COLOR_AUG_SSD: 81 | augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) 82 | augs.append(T.RandomFlip()) 83 | 84 | # Assume always applies to the training set. 85 | dataset_names = cfg.DATASETS.TRAIN 86 | meta = MetadataCatalog.get(dataset_names[0]) 87 | ignore_label = meta.ignore_label 88 | 89 | ret = { 90 | "is_train": is_train, 91 | "augmentations": augs, 92 | "image_format": cfg.INPUT.FORMAT, 93 | "ignore_label": ignore_label, 94 | "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY, 95 | } 96 | return ret 97 | 98 | def __call__(self, dataset_dict): 99 | """ 100 | Args: 101 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 102 | 103 | Returns: 104 | dict: a format that builtin models in detectron2 accept 105 | """ 106 | assert self.is_train, "MaskFormerSemanticDatasetMapper should only be used for training!" 107 | 108 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 109 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 110 | utils.check_image_size(dataset_dict, image) 111 | 112 | if "sem_seg_file_name" in dataset_dict: 113 | # PyTorch transformation not implemented for uint16, so converting it to double first 114 | sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") 115 | else: 116 | sem_seg_gt = None 117 | 118 | if sem_seg_gt is None: 119 | raise ValueError( 120 | "Cannot find 'sem_seg_file_name' for semantic segmentation dataset {}.".format( 121 | dataset_dict["file_name"] 122 | ) 123 | ) 124 | 125 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt) 126 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) 127 | image = aug_input.image 128 | sem_seg_gt = aug_input.sem_seg 129 | 130 | # Pad image and segmentation label here! 
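        # F.pad takes (left, right, top, bottom): the image is padded on the right/bottom until
        # H and W reach self.size_divisibility (pixel value 128), and the label map is padded
        # with self.ignore_label so the padded region is treated as ignored pixels.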
131 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 132 | if sem_seg_gt is not None: 133 | sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) 134 | 135 | if self.size_divisibility > 0: 136 | image_size = (image.shape[-2], image.shape[-1]) 137 | padding_size = [ 138 | 0, 139 | self.size_divisibility - image_size[1], 140 | 0, 141 | self.size_divisibility - image_size[0], 142 | ] 143 | image = F.pad(image, padding_size, value=128).contiguous() 144 | if sem_seg_gt is not None: 145 | sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous() 146 | 147 | image_shape = (image.shape[-2], image.shape[-1]) # h, w 148 | 149 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 150 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 151 | # Therefore it's important to use torch.Tensor. 152 | dataset_dict["image"] = image 153 | 154 | if sem_seg_gt is not None: 155 | dataset_dict["sem_seg"] = sem_seg_gt.long() 156 | 157 | if "annotations" in dataset_dict: 158 | raise ValueError("Semantic segmentation dataset should not have 'annotations'.") 159 | 160 | # Prepare per-category binary masks 161 | if sem_seg_gt is not None: 162 | sem_seg_gt = sem_seg_gt.numpy() 163 | instances = Instances(image_shape) 164 | classes = np.unique(sem_seg_gt) 165 | # remove ignored region 166 | classes = classes[classes != self.ignore_label] 167 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64) 168 | 169 | masks = [] 170 | for class_id in classes: 171 | masks.append(sem_seg_gt == class_id) 172 | 173 | if len(masks) == 0: 174 | # Some image does not have annotation (all ignored) 175 | instances.gt_masks = torch.zeros((0, sem_seg_gt.shape[-2], sem_seg_gt.shape[-1])) 176 | instances.gt_boxes = Boxes(torch.zeros((0,4))) 177 | else: 178 | masks = BitMasks( 179 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) 180 | ) 181 | instances.gt_masks = masks.tensor 182 | instances.gt_boxes = masks.get_bounding_boxes() 183 | 184 | dataset_dict["instances"] = instances 185 | 186 | return dataset_dict 187 | -------------------------------------------------------------------------------- /maskdino/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import ( 3 | register_ade20k_full, 4 | register_ade20k_panoptic, 5 | register_coco_stuff_10k, 6 | register_mapillary_vistas, 7 | register_coco_panoptic_annos_semseg, 8 | register_ade20k_instance, 9 | register_mapillary_vistas_panoptic, 10 | ) 11 | -------------------------------------------------------------------------------- /maskdino/data/datasets/register_ade20k_instance.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
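# Overview: registers the ADE20K instance-segmentation splits ("ade20k_instance_train" and
# "ade20k_instance_val") as COCO-format datasets. The 100 "thing" categories in ADE_CATEGORIES
# below are remapped to contiguous ids in [0, 99].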
2 | import json 3 | import logging 4 | import numpy as np 5 | import os 6 | from PIL import Image 7 | 8 | from detectron2.data import DatasetCatalog, MetadataCatalog 9 | from detectron2.data.datasets.coco import load_coco_json, register_coco_instances 10 | from detectron2.utils.file_io import PathManager 11 | 12 | ADE_CATEGORIES = [{'id': 7, 'name': 'bed'}, {'id': 8, 'name': 'windowpane'}, {'id': 10, 'name': 'cabinet'}, {'id': 12, 'name': 'person'}, {'id': 14, 'name': 'door'}, {'id': 15, 'name': 'table'}, {'id': 18, 'name': 'curtain'}, {'id': 19, 'name': 'chair'}, {'id': 20, 'name': 'car'}, {'id': 22, 'name': 'painting'}, {'id': 23, 'name': 'sofa'}, {'id': 24, 'name': 'shelf'}, {'id': 27, 'name': 'mirror'}, {'id': 30, 'name': 'armchair'}, {'id': 31, 'name': 'seat'}, {'id': 32, 'name': 'fence'}, {'id': 33, 'name': 'desk'}, {'id': 35, 'name': 'wardrobe'}, {'id': 36, 'name': 'lamp'}, {'id': 37, 'name': 'bathtub'}, {'id': 38, 'name': 'railing'}, {'id': 39, 'name': 'cushion'}, {'id': 41, 'name': 'box'}, {'id': 42, 'name': 'column'}, {'id': 43, 'name': 'signboard'}, {'id': 44, 'name': 'chest of drawers'}, {'id': 45, 'name': 'counter'}, {'id': 47, 'name': 'sink'}, {'id': 49, 'name': 'fireplace'}, {'id': 50, 'name': 'refrigerator'}, {'id': 53, 'name': 'stairs'}, {'id': 55, 'name': 'case'}, {'id': 56, 'name': 'pool table'}, {'id': 57, 'name': 'pillow'}, {'id': 58, 'name': 'screen door'}, {'id': 62, 'name': 'bookcase'}, {'id': 64, 'name': 'coffee table'}, {'id': 65, 'name': 'toilet'}, {'id': 66, 'name': 'flower'}, {'id': 67, 'name': 'book'}, {'id': 69, 'name': 'bench'}, {'id': 70, 'name': 'countertop'}, {'id': 71, 'name': 'stove'}, {'id': 72, 'name': 'palm'}, {'id': 73, 'name': 'kitchen island'}, {'id': 74, 'name': 'computer'}, {'id': 75, 'name': 'swivel chair'}, {'id': 76, 'name': 'boat'}, {'id': 78, 'name': 'arcade machine'}, {'id': 80, 'name': 'bus'}, {'id': 81, 'name': 'towel'}, {'id': 82, 'name': 'light'}, {'id': 83, 'name': 'truck'}, {'id': 85, 'name': 'chandelier'}, {'id': 86, 'name': 'awning'}, {'id': 87, 'name': 'streetlight'}, {'id': 88, 'name': 'booth'}, {'id': 89, 'name': 'television receiver'}, {'id': 90, 'name': 'airplane'}, {'id': 92, 'name': 'apparel'}, {'id': 93, 'name': 'pole'}, {'id': 95, 'name': 'bannister'}, {'id': 97, 'name': 'ottoman'}, {'id': 98, 'name': 'bottle'}, {'id': 102, 'name': 'van'}, {'id': 103, 'name': 'ship'}, {'id': 104, 'name': 'fountain'}, {'id': 107, 'name': 'washer'}, {'id': 108, 'name': 'plaything'}, {'id': 110, 'name': 'stool'}, {'id': 111, 'name': 'barrel'}, {'id': 112, 'name': 'basket'}, {'id': 115, 'name': 'bag'}, {'id': 116, 'name': 'minibike'}, {'id': 118, 'name': 'oven'}, {'id': 119, 'name': 'ball'}, {'id': 120, 'name': 'food'}, {'id': 121, 'name': 'step'}, {'id': 123, 'name': 'trade name'}, {'id': 124, 'name': 'microwave'}, {'id': 125, 'name': 'pot'}, {'id': 126, 'name': 'animal'}, {'id': 127, 'name': 'bicycle'}, {'id': 129, 'name': 'dishwasher'}, {'id': 130, 'name': 'screen'}, {'id': 132, 'name': 'sculpture'}, {'id': 133, 'name': 'hood'}, {'id': 134, 'name': 'sconce'}, {'id': 135, 'name': 'vase'}, {'id': 136, 'name': 'traffic light'}, {'id': 137, 'name': 'tray'}, {'id': 138, 'name': 'ashcan'}, {'id': 139, 'name': 'fan'}, {'id': 142, 'name': 'plate'}, {'id': 143, 'name': 'monitor'}, {'id': 144, 'name': 'bulletin board'}, {'id': 146, 'name': 'radiator'}, {'id': 147, 'name': 'glass'}, {'id': 148, 'name': 'clock'}, {'id': 149, 'name': 'flag'}] 13 | 14 | 15 | _PREDEFINED_SPLITS = { 16 | # point annotations without masks 17 | "ade20k_instance_train": ( 18 
| "ADEChallengeData2016/images/training", 19 | "ADEChallengeData2016/ade20k_instance_train.json", 20 | ), 21 | "ade20k_instance_val": ( 22 | "ADEChallengeData2016/images/validation", 23 | "ADEChallengeData2016/ade20k_instance_val.json", 24 | ), 25 | } 26 | 27 | 28 | def _get_ade_instances_meta(): 29 | thing_ids = [k["id"] for k in ADE_CATEGORIES] 30 | assert len(thing_ids) == 100, len(thing_ids) 31 | # Mapping from the incontiguous ADE category id to an id in [0, 99] 32 | thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} 33 | thing_classes = [k["name"] for k in ADE_CATEGORIES] 34 | ret = { 35 | "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, 36 | "thing_classes": thing_classes, 37 | } 38 | return ret 39 | 40 | 41 | def register_all_ade20k_instance(root): 42 | for key, (image_root, json_file) in _PREDEFINED_SPLITS.items(): 43 | # Assume pre-defined datasets live in `./datasets`. 44 | register_coco_instances( 45 | key, 46 | _get_ade_instances_meta(), 47 | os.path.join(root, json_file) if "://" not in json_file else json_file, 48 | os.path.join(root, image_root), 49 | ) 50 | 51 | 52 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 53 | register_all_ade20k_instance(_root) 54 | -------------------------------------------------------------------------------- /maskdino/data/datasets/register_coco_panoptic_annos_semseg.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import json 3 | import os 4 | 5 | from detectron2.data import DatasetCatalog, MetadataCatalog 6 | from detectron2.data.datasets import load_sem_seg 7 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES 8 | from detectron2.utils.file_io import PathManager 9 | 10 | 11 | _PREDEFINED_SPLITS_COCO_PANOPTIC = { 12 | "coco_2017_train_panoptic": ( 13 | # This is the original panoptic annotation directory 14 | "coco/panoptic_train2017", 15 | "coco/annotations/panoptic_train2017.json", 16 | # This directory contains semantic annotations that are 17 | # converted from panoptic annotations. 18 | # It is used by PanopticFPN. 19 | # You can use the script at detectron2/datasets/prepare_panoptic_fpn.py 20 | # to create these directories. 21 | "coco/panoptic_semseg_train2017", 22 | ), 23 | "coco_2017_val_panoptic": ( 24 | "coco/panoptic_val2017", 25 | "coco/annotations/panoptic_val2017.json", 26 | "coco/panoptic_semseg_val2017", 27 | ), 28 | } 29 | 30 | 31 | def get_metadata(): 32 | meta = {} 33 | # The following metadata maps contiguous id from [0, #thing categories + 34 | # #stuff categories) to their names and colors. We have to replica of the 35 | # same name and color under "thing_*" and "stuff_*" because the current 36 | # visualization function in D2 handles thing and class classes differently 37 | # due to some heuristic used in Panoptic FPN. We keep the same naming to 38 | # enable reusing existing visualization functions. 
39 | thing_classes = [k["name"] for k in COCO_CATEGORIES if k["isthing"] == 1] 40 | thing_colors = [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 1] 41 | stuff_classes = [k["name"] for k in COCO_CATEGORIES] 42 | stuff_colors = [k["color"] for k in COCO_CATEGORIES] 43 | 44 | meta["thing_classes"] = thing_classes 45 | meta["thing_colors"] = thing_colors 46 | meta["stuff_classes"] = stuff_classes 47 | meta["stuff_colors"] = stuff_colors 48 | 49 | # Convert category id for training: 50 | # category id: like semantic segmentation, it is the class id for each 51 | # pixel. Since there are some classes not used in evaluation, the category 52 | # id is not always contiguous and thus we have two set of category ids: 53 | # - original category id: category id in the original dataset, mainly 54 | # used for evaluation. 55 | # - contiguous category id: [0, #classes), in order to train the linear 56 | # softmax classifier. 57 | thing_dataset_id_to_contiguous_id = {} 58 | stuff_dataset_id_to_contiguous_id = {} 59 | 60 | for i, cat in enumerate(COCO_CATEGORIES): 61 | if cat["isthing"]: 62 | thing_dataset_id_to_contiguous_id[cat["id"]] = i 63 | # else: 64 | # stuff_dataset_id_to_contiguous_id[cat["id"]] = i 65 | 66 | # in order to use sem_seg evaluator 67 | stuff_dataset_id_to_contiguous_id[cat["id"]] = i 68 | 69 | meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id 70 | meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id 71 | 72 | return meta 73 | 74 | 75 | def load_coco_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta): 76 | """ 77 | Args: 78 | image_dir (str): path to the raw dataset. e.g., "~/coco/train2017". 79 | gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017". 80 | json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json". 81 | Returns: 82 | list[dict]: a list of dicts in Detectron2 standard format. (See 83 | `Using Custom Datasets `_ ) 84 | """ 85 | 86 | def _convert_category_id(segment_info, meta): 87 | if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]: 88 | segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][ 89 | segment_info["category_id"] 90 | ] 91 | segment_info["isthing"] = True 92 | else: 93 | segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][ 94 | segment_info["category_id"] 95 | ] 96 | segment_info["isthing"] = False 97 | return segment_info 98 | 99 | with PathManager.open(json_file) as f: 100 | json_info = json.load(f) 101 | 102 | ret = [] 103 | for ann in json_info["annotations"]: 104 | image_id = int(ann["image_id"]) 105 | # TODO: currently we assume image and label has the same filename but 106 | # different extension, and images have extension ".jpg" for COCO. Need 107 | # to make image extension a user-provided argument if we extend this 108 | # function to support other COCO-like datasets. 109 | image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg") 110 | label_file = os.path.join(gt_dir, ann["file_name"]) 111 | sem_label_file = os.path.join(semseg_dir, ann["file_name"]) 112 | segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]] 113 | ret.append( 114 | { 115 | "file_name": image_file, 116 | "image_id": image_id, 117 | "pan_seg_file_name": label_file, 118 | "sem_seg_file_name": sem_label_file, 119 | "segments_info": segments_info, 120 | } 121 | ) 122 | assert len(ret), f"No images found in {image_dir}!" 
123 | assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"] 124 | assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"] 125 | assert PathManager.isfile(ret[0]["sem_seg_file_name"]), ret[0]["sem_seg_file_name"] 126 | return ret 127 | 128 | 129 | def register_coco_panoptic_annos_sem_seg( 130 | name, metadata, image_root, panoptic_root, panoptic_json, sem_seg_root, instances_json 131 | ): 132 | panoptic_name = name 133 | delattr(MetadataCatalog.get(panoptic_name), "thing_classes") 134 | delattr(MetadataCatalog.get(panoptic_name), "thing_colors") 135 | MetadataCatalog.get(panoptic_name).set( 136 | thing_classes=metadata["thing_classes"], 137 | thing_colors=metadata["thing_colors"], 138 | # thing_dataset_id_to_contiguous_id=metadata["thing_dataset_id_to_contiguous_id"], 139 | ) 140 | 141 | # the name is "coco_2017_train_panoptic_with_sem_seg" and "coco_2017_val_panoptic_with_sem_seg" 142 | semantic_name = name + "_with_sem_seg" 143 | DatasetCatalog.register( 144 | semantic_name, 145 | lambda: load_coco_panoptic_json(panoptic_json, image_root, panoptic_root, sem_seg_root, metadata), 146 | ) 147 | MetadataCatalog.get(semantic_name).set( 148 | sem_seg_root=sem_seg_root, 149 | panoptic_root=panoptic_root, 150 | image_root=image_root, 151 | panoptic_json=panoptic_json, 152 | json_file=instances_json, 153 | evaluator_type="coco_panoptic_seg", 154 | ignore_label=255, 155 | label_divisor=1000, 156 | **metadata, 157 | ) 158 | 159 | 160 | def register_all_coco_panoptic_annos_sem_seg(root): 161 | for ( 162 | prefix, 163 | (panoptic_root, panoptic_json, semantic_root), 164 | ) in _PREDEFINED_SPLITS_COCO_PANOPTIC.items(): 165 | prefix_instances = prefix[: -len("_panoptic")] 166 | instances_meta = MetadataCatalog.get(prefix_instances) 167 | image_root, instances_json = instances_meta.image_root, instances_meta.json_file 168 | 169 | register_coco_panoptic_annos_sem_seg( 170 | prefix, 171 | get_metadata(), 172 | image_root, 173 | os.path.join(root, panoptic_root), 174 | os.path.join(root, panoptic_json), 175 | os.path.join(root, semantic_root), 176 | instances_json, 177 | ) 178 | 179 | 180 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 181 | register_all_coco_panoptic_annos_sem_seg(_root) 182 | -------------------------------------------------------------------------------- /maskdino/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-Research/MaskDINO/3831d8514a3728535ace8d4ecc7d28044c42dd14/maskdino/evaluation/__init__.py -------------------------------------------------------------------------------- /maskdino/evaluation/instance_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
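# Usage sketch (illustrative only; `cfg` and `model` are assumed to come from the
# usual detectron2 training setup, and "ade20k_instance_val" is one of the splits
# registered in maskdino/data/datasets/register_ade20k_instance.py):
#
#   from detectron2.data import build_detection_test_loader
#   from detectron2.evaluation import inference_on_dataset
#
#   evaluator = InstanceSegEvaluator("ade20k_instance_val", output_dir="./output")
#   val_loader = build_detection_test_loader(cfg, "ade20k_instance_val")
#   results = inference_on_dataset(model, val_loader, evaluator)
#
# The class below differs from the stock COCOEvaluator only in _eval_predictions:
# it drops the assertion that contiguous category ids cover [0, num_classes - 1],
# so datasets with non-contiguous ids (e.g. ADE20K instances) can be evaluated.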
2 | import contextlib 3 | import copy 4 | import io 5 | import itertools 6 | import json 7 | import logging 8 | import numpy as np 9 | import os 10 | import pickle 11 | from collections import OrderedDict 12 | import pycocotools.mask as mask_util 13 | import torch 14 | from pycocotools.coco import COCO 15 | from pycocotools.cocoeval import COCOeval 16 | from tabulate import tabulate 17 | 18 | import detectron2.utils.comm as comm 19 | from detectron2.config import CfgNode 20 | from detectron2.data import MetadataCatalog 21 | from detectron2.data.datasets.coco import convert_to_coco_json 22 | from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco 23 | from detectron2.evaluation.fast_eval_api import COCOeval_opt 24 | from detectron2.structures import Boxes, BoxMode, pairwise_iou 25 | from detectron2.utils.file_io import PathManager 26 | from detectron2.utils.logger import create_small_table 27 | 28 | 29 | # Modified from COCOEvaluator for instance segmentation 30 | class InstanceSegEvaluator(COCOEvaluator): 31 | """ 32 | Evaluate AR for object proposals, AP for instance detection/segmentation, AP 33 | for keypoint detection outputs using COCO's metrics. 34 | See http://cocodataset.org/#detection-eval and 35 | http://cocodataset.org/#keypoints-eval to understand its metrics. 36 | The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means 37 | the metric cannot be computed (e.g. due to no predictions made). 38 | 39 | In addition to COCO, this evaluator is able to support any bounding box detection, 40 | instance segmentation, or keypoint detection dataset. 41 | """ 42 | 43 | def _eval_predictions(self, predictions, img_ids=None): 44 | """ 45 | Evaluate predictions. Fill self._results with the metrics of the tasks. 46 | """ 47 | self._logger.info("Preparing results for COCO format ...") 48 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) 49 | tasks = self._tasks or self._tasks_from_predictions(coco_results) 50 | 51 | # unmap the category ids for COCO 52 | if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): 53 | dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id 54 | # all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) 55 | # num_classes = len(all_contiguous_ids) 56 | # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 57 | 58 | reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} 59 | for result in coco_results: 60 | category_id = result["category_id"] 61 | # assert category_id < num_classes, ( 62 | # f"A prediction has class={category_id}, " 63 | # f"but the dataset only has {num_classes} classes and " 64 | # f"predicted class id should be in [0, {num_classes - 1}]." 65 | # ) 66 | assert category_id in reverse_id_mapping, ( 67 | f"A prediction has class={category_id}, " 68 | f"but the dataset only has class ids in {dataset_id_to_contiguous_id}."
69 | ) 70 | result["category_id"] = reverse_id_mapping[category_id] 71 | 72 | if self._output_dir: 73 | file_path = os.path.join(self._output_dir, "coco_instances_results.json") 74 | self._logger.info("Saving results to {}".format(file_path)) 75 | with PathManager.open(file_path, "w") as f: 76 | f.write(json.dumps(coco_results)) 77 | f.flush() 78 | 79 | if not self._do_evaluation: 80 | self._logger.info("Annotations are not available for evaluation.") 81 | return 82 | 83 | self._logger.info( 84 | "Evaluating predictions with {} COCO API...".format( 85 | "unofficial" if self._use_fast_impl else "official" 86 | ) 87 | ) 88 | for task in sorted(tasks): 89 | assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!" 90 | coco_eval = ( 91 | _evaluate_predictions_on_coco( 92 | self._coco_api, 93 | coco_results, 94 | task, 95 | kpt_oks_sigmas=self._kpt_oks_sigmas, 96 | use_fast_impl=self._use_fast_impl, 97 | img_ids=img_ids, 98 | max_dets_per_image=self._max_dets_per_image, 99 | ) 100 | if len(coco_results) > 0 101 | else None # cocoapi does not handle empty results very well 102 | ) 103 | 104 | res = self._derive_coco_results( 105 | coco_eval, task, class_names=self._metadata.get("thing_classes") 106 | ) 107 | self._results[task] = res 108 | -------------------------------------------------------------------------------- /maskdino/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IDEA, Inc. and its affiliates. 2 | from .backbone.swin import D2SwinTransformer 3 | from .pixel_decoder.maskdino_encoder import MaskDINOEncoder 4 | from .meta_arch.maskdino_head import MaskDINOHead 5 | 6 | -------------------------------------------------------------------------------- /maskdino/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /maskdino/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IDEA, Inc. and its affiliates. 2 | 3 | -------------------------------------------------------------------------------- /maskdino/modeling/meta_arch/maskdino_head.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 IDEA. All Rights Reserved. 3 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 4 | # ------------------------------------------------------------------------ 5 | # Modified from Mask2Former https://github.com/facebookresearch/Mask2Former by Feng Li and Hao Zhang. 
6 | # ------------------------------------------------------------------------------ 7 | import logging 8 | from typing import Callable, Dict, List, Optional, Tuple, Union 9 | 10 | from torch import nn 11 | 12 | from detectron2.config import configurable 13 | from detectron2.layers import Conv2d, ShapeSpec, get_norm 14 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY 15 | 16 | from ..transformer_decoder.maskdino_decoder import build_transformer_decoder 17 | from ..pixel_decoder.maskdino_encoder import build_pixel_decoder 18 | 19 | 20 | @SEM_SEG_HEADS_REGISTRY.register() 21 | class MaskDINOHead(nn.Module): 22 | @configurable 23 | def __init__( 24 | self, 25 | input_shape: Dict[str, ShapeSpec], 26 | *, 27 | num_classes: int, 28 | pixel_decoder: nn.Module, 29 | loss_weight: float = 1.0, 30 | ignore_value: int = -1, 31 | transformer_predictor: nn.Module, 32 | ): 33 | """ 34 | Args: 35 | input_shape: shapes (channels and stride) of the input features 36 | num_classes: number of classes to predict 37 | pixel_decoder: the pixel decoder module 38 | loss_weight: loss weight 39 | ignore_value: category id to be ignored during training. 40 | transformer_predictor: the transformer decoder that makes prediction 41 | transformer_in_feature: input feature name to the transformer_predictor 42 | """ 43 | super().__init__() 44 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) 45 | self.in_features = [k for k, v in input_shape] 46 | self.ignore_value = ignore_value 47 | self.common_stride = 4 48 | self.loss_weight = loss_weight 49 | 50 | self.pixel_decoder = pixel_decoder 51 | self.predictor = transformer_predictor 52 | 53 | self.num_classes = num_classes 54 | 55 | @classmethod 56 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): 57 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 58 | 59 | return { 60 | "input_shape": { 61 | k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES 62 | }, 63 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 64 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 65 | "pixel_decoder": build_pixel_decoder(cfg, input_shape), 66 | "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, 67 | "transformer_predictor": build_transformer_decoder( 68 | cfg, 69 | transformer_predictor_in_channels, 70 | mask_classification=True, 71 | ), 72 | } 73 | 74 | def forward(self, features, mask=None,targets=None): 75 | return self.layers(features, mask,targets=targets) 76 | 77 | def layers(self, features, mask=None,targets=None): 78 | mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(features, mask) 79 | 80 | predictions = self.predictor(multi_scale_features, mask_features, mask, targets=targets) 81 | 82 | return predictions 83 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IDEA, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | try: 22 | import MultiScaleDeformableAttention as MSDA 23 | except ModuleNotFoundError as e: 24 | info_string = ( 25 | "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" 26 | "\t`cd maskdino/modeling/pixel_decoder/ops`\n" 27 | "\t`sh make.sh`\n" 28 | ) 29 | raise ModuleNotFoundError(info_string) 30 | 31 | 32 | class MSDeformAttnFunction(Function): 33 | @staticmethod 34 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 35 | ctx.im2col_step = im2col_step 36 | output = MSDA.ms_deform_attn_forward( 37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 38 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 39 | return output 40 | 41 | @staticmethod 42 | @once_differentiable 43 | def backward(ctx, grad_output): 44 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 45 | grad_value, grad_sampling_loc, grad_attn_weight = \ 46 | MSDA.ms_deform_attn_backward( 47 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 48 | 49 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 50 | 51 | 52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 53 | # for debug and test only, 54 | # need to use cuda version instead 55 | N_, S_, M_, D_ = value.shape 56 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 57 | value_list = value.split([H_ * W_ 
for H_, W_ in value_spatial_shapes], dim=1) 58 | sampling_grids = 2 * sampling_locations - 1 59 | sampling_value_list = [] 60 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 61 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 62 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 63 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 64 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 65 | # N_*M_, D_, Lq_, P_ 66 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 67 | mode='bilinear', padding_mode='zeros', align_corners=False) 68 | sampling_value_list.append(sampling_value_l_) 69 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 70 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 71 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 72 | return output.transpose(1, 2).contiguous() 73 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | # Copyright (c) Facebook, Inc. and its affiliates. 11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 12 | 13 | python setup.py build install 14 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import warnings 17 | import math 18 | 19 | import torch 20 | from torch import nn 21 | import torch.nn.functional as F 22 | from torch.nn.init import xavier_uniform_, constant_ 23 | 24 | from ..functions import MSDeformAttnFunction 25 | from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch 26 | 27 | 28 | def _is_power_of_2(n): 29 | if (not isinstance(n, int)) or (n < 0): 30 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 31 | return (n & (n-1) == 0) and n != 0 32 | 33 | 34 | class MSDeformAttn(nn.Module): 35 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): 36 | """ 37 | Multi-Scale Deformable Attention Module 38 | :param d_model hidden dimension 39 | :param n_levels number of feature levels 40 | :param n_heads number of attention heads 41 | :param n_points number of sampling points per attention head per feature level 42 | """ 43 | super().__init__() 44 | if d_model % n_heads != 0: 45 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 46 | _d_per_head = d_model // n_heads 47 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 48 | if not _is_power_of_2(_d_per_head): 49 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 50 | "which is more efficient in our CUDA implementation.") 51 | 52 | self.im2col_step = 128 53 | 54 | self.d_model = d_model 55 | self.n_levels = n_levels 56 | self.n_heads = n_heads 57 | self.n_points = n_points 58 | 59 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 60 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 61 | self.value_proj = nn.Linear(d_model, d_model) 62 | self.output_proj = nn.Linear(d_model, d_model) 63 | 64 | self._reset_parameters() 65 | 66 | def _reset_parameters(self): 67 | constant_(self.sampling_offsets.weight.data, 0.) 68 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 69 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 70 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 71 | for i in range(self.n_points): 72 | grid_init[:, :, i, :] *= i + 1 73 | with torch.no_grad(): 74 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 75 | constant_(self.attention_weights.weight.data, 0.) 76 | constant_(self.attention_weights.bias.data, 0.) 77 | xavier_uniform_(self.value_proj.weight.data) 78 | constant_(self.value_proj.bias.data, 0.) 79 | xavier_uniform_(self.output_proj.weight.data) 80 | constant_(self.output_proj.bias.data, 0.) 
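    # Shape sketch for forward() below (illustrative numbers only): with the
    # defaults d_model=256, n_levels=4, n_heads=8, n_points=4 and feature maps of
    # sizes (100, 150), (50, 75), (25, 38), (13, 19):
    #   query:                   (N, Len_q, 256)
    #   reference_points:        (N, Len_q, 4, 2) normalized to [0, 1]
    #                            (or (N, Len_q, 4, 4) when reference boxes are used)
    #   input_flatten:           (N, 100*150 + 50*75 + 25*38 + 13*19, 256)
    #   input_spatial_shapes:    tensor([[100, 150], [50, 75], [25, 38], [13, 19]])
    #   input_level_start_index: tensor([0, 15000, 18750, 19700])
    #   output:                  (N, Len_q, 256)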
81 | 82 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 83 | """ 84 | :param query (N, Length_{query}, C) 85 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 86 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 87 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 88 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 89 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 90 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 91 | 92 | :return output (N, Length_{query}, C) 93 | """ 94 | N, Len_q, _ = query.shape 95 | N, Len_in, _ = input_flatten.shape 96 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 97 | 98 | value = self.value_proj(input_flatten) 99 | if input_padding_mask is not None: 100 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 101 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 102 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 103 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 104 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 105 | # N, Len_q, n_heads, n_levels, n_points, 2 106 | if reference_points.shape[-1] == 2: 107 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 108 | sampling_locations = reference_points[:, :, None, :, None, :] \ 109 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] 110 | elif reference_points.shape[-1] == 4: 111 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 112 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 113 | else: 114 | raise ValueError( 115 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 116 | try: 117 | output = MSDeformAttnFunction.apply( 118 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 119 | except: 120 | # CPU 121 | output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 122 | # # For FLOPs calculation only 123 | # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 124 | output = self.output_proj(output) 125 | return output 126 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | import os 13 | import glob 14 | 15 | import torch 16 | 17 | from torch.utils.cpp_extension import CUDA_HOME 18 | from torch.utils.cpp_extension import CppExtension 19 | from torch.utils.cpp_extension import CUDAExtension 20 | 21 | from setuptools import find_packages 22 | from setuptools import setup 23 | 24 | requirements = ["torch", "torchvision"] 25 | 26 | def get_extensions(): 27 | this_dir = os.path.dirname(os.path.abspath(__file__)) 28 | extensions_dir = os.path.join(this_dir, "src") 29 | 30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 33 | 34 | sources = main_file + source_cpu 35 | extension = CppExtension 36 | extra_compile_args = {"cxx": []} 37 | define_macros = [] 38 | 39 | # Force cuda since torch ask for a device, not if cuda is in fact available. 40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: 41 | extension = CUDAExtension 42 | sources += source_cuda 43 | define_macros += [("WITH_CUDA", None)] 44 | extra_compile_args["nvcc"] = [ 45 | "-DCUDA_HAS_FP16=1", 46 | "-D__CUDA_NO_HALF_OPERATORS__", 47 | "-D__CUDA_NO_HALF_CONVERSIONS__", 48 | "-D__CUDA_NO_HALF2_OPERATORS__", 49 | ] 50 | else: 51 | if CUDA_HOME is None: 52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') 53 | else: 54 | raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') 55 | 56 | sources = [os.path.join(extensions_dir, s) for s in sources] 57 | include_dirs = [extensions_dir] 58 | ext_modules = [ 59 | extension( 60 | "MultiScaleDeformableAttention", 61 | sources, 62 | include_dirs=include_dirs, 63 | define_macros=define_macros, 64 | extra_compile_args=extra_compile_args, 65 | ) 66 | ] 67 | return ext_modules 68 | 69 | setup( 70 | name="MultiScaleDeformableAttention", 71 | version="1.0", 72 | author="Weijie Su", 73 | url="https://github.com/fundamentalvision/Deformable-DETR", 74 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 75 | packages=find_packages(exclude=("configs", "tests",)), 76 | ext_modules=get_extensions(), 77 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 78 | ) 79 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include <vector> 17 | 18 | #include <ATen/ATen.h> 19 | #include <ATen/cuda/CUDAContext.h> 20 | 21 | 22 | at::Tensor 23 | ms_deform_attn_cpu_forward( 24 | const at::Tensor &value, 25 | const at::Tensor &spatial_shapes, 26 | const at::Tensor &level_start_index, 27 | const at::Tensor &sampling_loc, 28 | const at::Tensor &attn_weight, 29 | const int im2col_step) 30 | { 31 | AT_ERROR("Not implemented on the CPU"); 32 | } 33 | 34 | std::vector<at::Tensor> 35 | ms_deform_attn_cpu_backward( 36 | const at::Tensor &value, 37 | const at::Tensor &spatial_shapes, 38 | const at::Tensor &level_start_index, 39 | const at::Tensor &sampling_loc, 40 | const at::Tensor &attn_weight, 41 | const at::Tensor &grad_output, 42 | const int im2col_step) 43 | { 44 | AT_ERROR("Not implemented on the CPU"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include <torch/extension.h> 18 | 19 | at::Tensor 20 | ms_deform_attn_cpu_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step); 27 | 28 | std::vector<at::Tensor> 29 | ms_deform_attn_cpu_backward( 30 | const at::Tensor &value, 31 | const at::Tensor &spatial_shapes, 32 | const at::Tensor &level_start_index, 33 | const at::Tensor &sampling_loc, 34 | const at::Tensor &attn_weight, 35 | const at::Tensor &grad_output, 36 | const int im2col_step); 37 | 38 | 39 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include <vector> 17 | #include "cuda/ms_deform_im2col_cuda.cuh" 18 | 19 | #include <ATen/ATen.h> 20 | #include <ATen/cuda/CUDAContext.h> 21 | #include <cuda.h> 22 | #include <cuda_runtime.h> 23 | 24 | 25 | at::Tensor ms_deform_attn_cuda_forward( 26 | const at::Tensor &value, 27 | const at::Tensor &spatial_shapes, 28 | const at::Tensor &level_start_index, 29 | const at::Tensor &sampling_loc, 30 | const at::Tensor &attn_weight, 31 | const int im2col_step) 32 | { 33 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 34 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 35 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 36 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 37 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 38 | 39 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 40 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 41 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 42 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 43 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 44 | 45 | const int batch = value.size(0); 46 | const int spatial_size = value.size(1); 47 | const int num_heads = value.size(2); 48 | const int channels = value.size(3); 49 | 50 | const int num_levels = spatial_shapes.size(0); 51 | 52 | const int num_query = sampling_loc.size(1); 53 | const int num_point = sampling_loc.size(4); 54 | 55 | const int im2col_step_ = std::min(batch, im2col_step); 56 | 57 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 58 | 59 | auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); 60 | 61 | const int batch_n = im2col_step_; 62 | auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 63 | auto per_value_size = spatial_size * num_heads * channels; 64 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 65 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; 66 | for (int n = 0; n < batch/im2col_step_; ++n) 67 | { 68 | auto columns = output_n.select(0, n); 69 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] { 70 | ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(), 71 | value.data<scalar_t>() + n * im2col_step_ * per_value_size, 72 | spatial_shapes.data<int64_t>(), 73 | level_start_index.data<int64_t>(), 74 | sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size, 75 | attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size, 76 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 77 | columns.data<scalar_t>()); 78 | 79 | })); 80 | } 81 | 82 | output = output.view({batch, num_query, num_heads*channels}); 83 | 84 | return output; 85 | } 86 | 87
| 88 | std::vector<at::Tensor> ms_deform_attn_cuda_backward( 89 | const at::Tensor &value, 90 | const at::Tensor &spatial_shapes, 91 | const at::Tensor &level_start_index, 92 | const at::Tensor &sampling_loc, 93 | const at::Tensor &attn_weight, 94 | const at::Tensor &grad_output, 95 | const int im2col_step) 96 | { 97 | 98 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 99 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 100 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 101 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 102 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 103 | AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); 104 | 105 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 106 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 107 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 108 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 109 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 110 | AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); 111 | 112 | const int batch = value.size(0); 113 | const int spatial_size = value.size(1); 114 | const int num_heads = value.size(2); 115 | const int channels = value.size(3); 116 | 117 | const int num_levels = spatial_shapes.size(0); 118 | 119 | const int num_query = sampling_loc.size(1); 120 | const int num_point = sampling_loc.size(4); 121 | 122 | const int im2col_step_ = std::min(batch, im2col_step); 123 | 124 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 125 | 126 | auto grad_value = at::zeros_like(value); 127 | auto grad_sampling_loc = at::zeros_like(sampling_loc); 128 | auto grad_attn_weight = at::zeros_like(attn_weight); 129 | 130 | const int batch_n = im2col_step_; 131 | auto per_value_size = spatial_size * num_heads * channels; 132 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 133 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; 134 | auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 135 | 136 | for (int n = 0; n < batch/im2col_step_; ++n) 137 | { 138 | auto grad_output_g = grad_output_n.select(0, n); 139 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] { 140 | ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(), 141 | grad_output_g.data<scalar_t>(), 142 | value.data<scalar_t>() + n * im2col_step_ * per_value_size, 143 | spatial_shapes.data<int64_t>(), 144 | level_start_index.data<int64_t>(), 145 | sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size, 146 | attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size, 147 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 148 | grad_value.data<scalar_t>() + n * im2col_step_ * per_value_size, 149 | grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size, 150 | grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size); 151 | 152 | })); 153 | } 154 | 155 | return { 156 | grad_value, grad_sampling_loc, grad_attn_weight 157 | }; 158 | } --------------------------------------------------------------------------------
/maskdino/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include <torch/extension.h> 18 | 19 | at::Tensor ms_deform_attn_cuda_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step); 26 | 27 | std::vector<at::Tensor> ms_deform_attn_cuda_backward( 28 | const at::Tensor &value, 29 | const at::Tensor &spatial_shapes, 30 | const at::Tensor &level_start_index, 31 | const at::Tensor &sampling_loc, 32 | const at::Tensor &attn_weight, 33 | const at::Tensor &grad_output, 34 | const int im2col_step); 35 | 36 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | 18 | #include "cpu/ms_deform_attn_cpu.h" 19 | 20 | #ifdef WITH_CUDA 21 | #include "cuda/ms_deform_attn_cuda.h" 22 | #endif 23 | 24 | 25 | at::Tensor 26 | ms_deform_attn_forward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const int im2col_step) 33 | { 34 | if (value.type().is_cuda()) 35 | { 36 | #ifdef WITH_CUDA 37 | return ms_deform_attn_cuda_forward( 38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector<at::Tensor> 47 | ms_deform_attn_backward( 48 | const at::Tensor &value, 49 | const at::Tensor &spatial_shapes, 50 | const at::Tensor &level_start_index, 51 | const at::Tensor &sampling_loc, 52 | const at::Tensor &attn_weight, 53 | const at::Tensor &grad_output, 54 | const int im2col_step) 55 | { 56 | if (value.type().is_cuda()) 57 | { 58 | #ifdef WITH_CUDA 59 | return ms_deform_attn_cuda_backward( 60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 61 | #else 62 | AT_ERROR("Not compiled with GPU support"); 63 | #endif 64 | } 65 | AT_ERROR("Not implemented on the CPU"); 66 | } 67 | 68 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include "ms_deform_attn.h" 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import time 17 | import torch 18 | import torch.nn as nn 19 | from torch.autograd import gradcheck 20 | 21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | N, M, D = 1, 2, 2 25 | Lq, L, P = 2, 2, 2 26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 28 | S = sum([(H*W).item() for H, W in shapes]) 29 | 30 | 31 | torch.manual_seed(3) 32 | 33 | 34 | @torch.no_grad() 35 | def check_forward_equal_with_pytorch_double(): 36 | value = torch.rand(N, S, M, D).cuda() * 0.01 37 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 38 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 39 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 40 | im2col_step = 2 41 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 42 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 43 | fwdok = torch.allclose(output_cuda, output_pytorch) 44 | max_abs_err = (output_cuda - output_pytorch).abs().max() 45 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 46 | 47 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 48 | 49 | 50 | @torch.no_grad() 51 | def check_forward_equal_with_pytorch_float(): 52 | value = torch.rand(N, S, M, D).cuda() * 0.01 53 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 54 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 55 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 56 | im2col_step = 2 57 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 58 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 59 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 60 | max_abs_err = (output_cuda - output_pytorch).abs().max() 61 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 62 | 63 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 64 | 65 | 66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 67 | 68 | value = torch.rand(N, S, M, channels).cuda() * 0.01 69 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 70 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 71 | attention_weights /= 
attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 72 | im2col_step = 2 73 | func = MSDeformAttnFunction.apply 74 | 75 | value.requires_grad = grad_value 76 | sampling_locations.requires_grad = grad_sampling_loc 77 | attention_weights.requires_grad = grad_attn_weight 78 | 79 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 80 | 81 | print(f'* {gradok} check_gradient_numerical(D={channels})') 82 | 83 | 84 | if __name__ == '__main__': 85 | check_forward_equal_with_pytorch_double() 86 | check_forward_equal_with_pytorch_float() 87 | 88 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 89 | check_gradient_numerical(channels, True, True, True) 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /maskdino/modeling/pixel_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 IDEA. All Rights Reserved. 3 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 4 | # ------------------------------------------------------------------------ 5 | # Modified from Mask2Former https://github.com/facebookresearch/Mask2Former by Feng Li and Hao Zhang. 6 | """ 7 | Various positional encodings for the transformer. 8 | """ 9 | import math 10 | 11 | import torch 12 | from torch import nn 13 | 14 | 15 | class PositionEmbeddingSine(nn.Module): 16 | """ 17 | This is a more standard version of the position embedding, very similar to the one 18 | used by the Attention is all you need paper, generalized to work on images. 19 | """ 20 | 21 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 22 | super().__init__() 23 | self.num_pos_feats = num_pos_feats 24 | self.temperature = temperature 25 | self.normalize = normalize 26 | if scale is not None and normalize is False: 27 | raise ValueError("normalize should be True if scale is passed") 28 | if scale is None: 29 | scale = 2 * math.pi 30 | self.scale = scale 31 | 32 | def forward(self, x, mask=None): 33 | if mask is None: 34 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 35 | not_mask = ~mask 36 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 37 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 38 | if self.normalize: 39 | eps = 1e-6 40 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 41 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 42 | 43 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 44 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 45 | 46 | pos_x = x_embed[:, :, :, None] / dim_t 47 | pos_y = y_embed[:, :, :, None] / dim_t 48 | pos_x = torch.stack( 49 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 50 | ).flatten(3) 51 | pos_y = torch.stack( 52 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 53 | ).flatten(3) 54 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 55 | return pos 56 | 57 | def __repr__(self, _repr_indent=4): 58 | head = "Positional encoding " + self.__class__.__name__ 59 | body = [ 60 | "num_pos_feats: {}".format(self.num_pos_feats), 61 | "temperature: {}".format(self.temperature), 62 | "normalize: {}".format(self.normalize), 63 | "scale: {}".format(self.scale), 64 | ] 65 | # _repr_indent = 4 66 | lines = 
[head] + [" " * _repr_indent + line for line in body] 67 | return "\n".join(lines) 68 | -------------------------------------------------------------------------------- /maskdino/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) IDEA, Inc. and its affiliates. 2 | from .maskdino_decoder import MaskDINODecoder 3 | 4 | -------------------------------------------------------------------------------- /maskdino/test_time_augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | import logging 4 | from itertools import count 5 | 6 | import numpy as np 7 | import torch 8 | from fvcore.transforms import HFlipTransform 9 | from torch import nn 10 | from torch.nn.parallel import DistributedDataParallel 11 | 12 | from detectron2.data.detection_utils import read_image 13 | from detectron2.modeling import DatasetMapperTTA 14 | 15 | 16 | __all__ = [ 17 | "SemanticSegmentorWithTTA", 18 | ] 19 | 20 | 21 | class SemanticSegmentorWithTTA(nn.Module): 22 | """ 23 | A SemanticSegmentor with test-time augmentation enabled. 24 | Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`. 25 | """ 26 | 27 | def __init__(self, cfg, model, tta_mapper=None, batch_size=1): 28 | """ 29 | Args: 30 | cfg (CfgNode): 31 | model (SemanticSegmentor): a SemanticSegmentor to apply TTA on. 32 | tta_mapper (callable): takes a dataset dict and returns a list of 33 | augmented versions of the dataset dict. Defaults to 34 | `DatasetMapperTTA(cfg)`. 35 | batch_size (int): batch the augmented images into this batch size for inference. 36 | """ 37 | super().__init__() 38 | if isinstance(model, DistributedDataParallel): 39 | model = model.module 40 | self.cfg = cfg.clone() 41 | 42 | self.model = model 43 | 44 | if tta_mapper is None: 45 | tta_mapper = DatasetMapperTTA(cfg) 46 | self.tta_mapper = tta_mapper 47 | self.batch_size = batch_size 48 | 49 | def __call__(self, batched_inputs): 50 | """ 51 | Same input/output format as :meth:`SemanticSegmentor.forward` 52 | """ 53 | 54 | def _maybe_read_image(dataset_dict): 55 | ret = copy.copy(dataset_dict) 56 | if "image" not in ret: 57 | image = read_image(ret.pop("file_name"), self.model.input_format) 58 | image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW 59 | ret["image"] = image 60 | if "height" not in ret and "width" not in ret: 61 | ret["height"] = image.shape[1] 62 | ret["width"] = image.shape[2] 63 | return ret 64 | 65 | processed_results = [] 66 | for x in batched_inputs: 67 | result = self._inference_one_image(_maybe_read_image(x)) 68 | processed_results.append(result) 69 | return processed_results 70 | 71 | def _inference_one_image(self, input): 72 | """ 73 | Args: 74 | input (dict): one dataset dict with "image" field being a CHW tensor 75 | Returns: 76 | dict: one output dict 77 | """ 78 | orig_shape = (input["height"], input["width"]) 79 | augmented_inputs, tfms = self._get_augmented_inputs(input) 80 | 81 | final_predictions = None 82 | count_predictions = 0 83 | for input, tfm in zip(augmented_inputs, tfms): 84 | count_predictions += 1 85 | with torch.no_grad(): 86 | if final_predictions is None: 87 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 88 | final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2]) 89 | else: 90 | final_predictions = self.model([input])[0].pop("sem_seg") 91 | 
else: 92 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 93 | final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2]) 94 | else: 95 | final_predictions += self.model([input])[0].pop("sem_seg") 96 | 97 | final_predictions = final_predictions / count_predictions 98 | return {"sem_seg": final_predictions} 99 | 100 | def _get_augmented_inputs(self, input): 101 | augmented_inputs = self.tta_mapper(input) 102 | tfms = [x.pop("transforms") for x in augmented_inputs] 103 | return augmented_inputs, tfms 104 | -------------------------------------------------------------------------------- /maskdino/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # import misc -------------------------------------------------------------------------------- /maskdino/utils/box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Utilities for bounding box manipulation and GIoU. 4 | """ 5 | import torch 6 | from torchvision.ops.boxes import box_area 7 | 8 | 9 | def box_cxcywh_to_xyxy(x): 10 | x_c, y_c, w, h = x.unbind(-1) 11 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 12 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 13 | return torch.stack(b, dim=-1) 14 | 15 | 16 | def box_xyxy_to_cxcywh(x): 17 | x0, y0, x1, y1 = x.unbind(-1) 18 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 19 | (x1 - x0), (y1 - y0)] 20 | return torch.stack(b, dim=-1) 21 | 22 | 23 | # modified from torchvision to also return the union 24 | def box_iou(boxes1, boxes2): 25 | area1 = box_area(boxes1) 26 | area2 = box_area(boxes2) 27 | 28 | 29 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 30 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 31 | 32 | wh = (rb - lt).clamp(min=0) # [N,M,2] 33 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 34 | 35 | union = area1[:, None] + area2 - inter 36 | 37 | iou = inter / (union + 1e-6) 38 | return iou, union 39 | 40 | 41 | def generalized_box_iou(boxes1, boxes2): 42 | """ 43 | Generalized IoU from https://giou.stanford.edu/ 44 | 45 | The boxes should be in [x0, y0, x1, y1] format 46 | 47 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 48 | and M = len(boxes2) 49 | """ 50 | # degenerate boxes gives inf / nan results 51 | # so do an early check 52 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 53 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 54 | iou, union = box_iou(boxes1, boxes2) 55 | 56 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 57 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 58 | 59 | wh = (rb - lt).clamp(min=0) # [N,M,2] 60 | area = wh[:, :, 0] * wh[:, :, 1] 61 | 62 | return iou - (area - union) / (area + 1e-6) 63 | 64 | 65 | 66 | # modified from torchvision to also return the union 67 | def box_iou_pairwise(boxes1, boxes2): 68 | area1 = box_area(boxes1) 69 | area2 = box_area(boxes2) 70 | 71 | lt = torch.max(boxes1[:, :2], boxes2[:, :2]) # [N,2] 72 | rb = torch.min(boxes1[:, 2:], boxes2[:, 2:]) # [N,2] 73 | 74 | wh = (rb - lt).clamp(min=0) # [N,2] 75 | inter = wh[:, 0] * wh[:, 1] # [N] 76 | 77 | union = area1 + area2 - inter 78 | 79 | iou = inter / union 80 | return iou, union 81 | 82 | 83 | def generalized_box_iou_pairwise(boxes1, boxes2): 84 | """ 85 | Generalized IoU from https://giou.stanford.edu/ 86 | 87 | Input: 88 | - boxes1, boxes2: N,4 89 | Output: 90 | - giou: N, 4 91 | """ 92 | # degenerate boxes gives 
inf / nan results 93 | # so do an early check 94 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 95 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 96 | assert boxes1.shape == boxes2.shape 97 | iou, union = box_iou_pairwise(boxes1, boxes2) # N, 4 98 | 99 | lt = torch.min(boxes1[:, :2], boxes2[:, :2]) 100 | rb = torch.max(boxes1[:, 2:], boxes2[:, 2:]) 101 | 102 | wh = (rb - lt).clamp(min=0) # [N,2] 103 | area = wh[:, 0] * wh[:, 1] 104 | 105 | return iou - (area - union) / area 106 | 107 | def masks_to_boxes(masks): 108 | """Compute the bounding boxes around the provided masks 109 | 110 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 111 | 112 | Returns a [N, 4] tensors, with the boxes in xyxy format 113 | """ 114 | if masks.numel() == 0: 115 | return torch.zeros((0, 4), device=masks.device) 116 | 117 | h, w = masks.shape[-2:] 118 | 119 | y = torch.arange(0, h, dtype=torch.float, device=masks.device) 120 | x = torch.arange(0, w, dtype=torch.float, device=masks.device) 121 | y, x = torch.meshgrid(y, x) 122 | 123 | x_mask = (masks * x.unsqueeze(0)) 124 | x_max = x_mask.flatten(1).max(-1)[0] 125 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 126 | 127 | y_mask = (masks * y.unsqueeze(0)) 128 | y_max = y_mask.flatten(1).max(-1)[0] 129 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 130 | 131 | return torch.stack([x_min, y_min, x_max, y_max], 1) 132 | 133 | if __name__ == '__main__': 134 | x = torch.rand(5, 4) 135 | y = torch.rand(3, 4) 136 | iou, union = box_iou(x, y) -------------------------------------------------------------------------------- /maskdino/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py 3 | """ 4 | Misc functions, including distributed helpers. 5 | 6 | Mostly copy-paste from torchvision references. 
7 | """ 8 | from typing import List, Optional 9 | 10 | import torch 11 | import torch.distributed as dist 12 | import torchvision 13 | from torch import Tensor 14 | 15 | 16 | def _max_by_axis(the_list): 17 | # type: (List[List[int]]) -> List[int] 18 | maxes = the_list[0] 19 | for sublist in the_list[1:]: 20 | for index, item in enumerate(sublist): 21 | maxes[index] = max(maxes[index], item) 22 | return maxes 23 | 24 | 25 | class NestedTensor(object): 26 | def __init__(self, tensors, mask: Optional[Tensor]): 27 | self.tensors = tensors 28 | self.mask = mask 29 | 30 | def to(self, device): 31 | # type: (Device) -> NestedTensor # noqa 32 | cast_tensor = self.tensors.to(device) 33 | mask = self.mask 34 | if mask is not None: 35 | assert mask is not None 36 | cast_mask = mask.to(device) 37 | else: 38 | cast_mask = None 39 | return NestedTensor(cast_tensor, cast_mask) 40 | 41 | def decompose(self): 42 | return self.tensors, self.mask 43 | 44 | def __repr__(self): 45 | return str(self.tensors) 46 | 47 | 48 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 49 | # TODO make this more general 50 | if tensor_list[0].ndim == 3: 51 | if torchvision._is_tracing(): 52 | # nested_tensor_from_tensor_list() does not export well to ONNX 53 | # call _onnx_nested_tensor_from_tensor_list() instead 54 | return _onnx_nested_tensor_from_tensor_list(tensor_list) 55 | 56 | # TODO make it support different-sized images 57 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 58 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 59 | batch_shape = [len(tensor_list)] + max_size 60 | b, c, h, w = batch_shape 61 | dtype = tensor_list[0].dtype 62 | device = tensor_list[0].device 63 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 64 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 65 | for img, pad_img, m in zip(tensor_list, tensor, mask): 66 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 67 | m[: img.shape[1], : img.shape[2]] = False 68 | else: 69 | raise ValueError("not supported") 70 | return NestedTensor(tensor, mask) 71 | 72 | 73 | # _onnx_nested_tensor_from_tensor_list() is an implementation of 74 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
75 | @torch.jit.unused 76 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: 77 | max_size = [] 78 | for i in range(tensor_list[0].dim()): 79 | max_size_i = torch.max( 80 | torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) 81 | ).to(torch.int64) 82 | max_size.append(max_size_i) 83 | max_size = tuple(max_size) 84 | 85 | # work around for 86 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 87 | # m[: img.shape[1], :img.shape[2]] = False 88 | # which is not yet supported in onnx 89 | padded_imgs = [] 90 | padded_masks = [] 91 | for img in tensor_list: 92 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 93 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 94 | padded_imgs.append(padded_img) 95 | 96 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) 97 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) 98 | padded_masks.append(padded_mask.to(torch.bool)) 99 | 100 | tensor = torch.stack(padded_imgs) 101 | mask = torch.stack(padded_masks) 102 | 103 | return NestedTensor(tensor, mask=mask) 104 | 105 | 106 | def is_dist_avail_and_initialized(): 107 | if not dist.is_available(): 108 | return False 109 | if not dist.is_initialized(): 110 | return False 111 | return True 112 | 113 | def masks_to_boxes(masks): 114 | """Compute the bounding boxes around the provided masks 115 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 116 | Returns a [N, 4] tensors, with the boxes in xyxy format 117 | """ 118 | if masks.numel() == 0: 119 | return torch.zeros((0, 4), device=masks.device) 120 | 121 | h, w = masks.shape[-2:] 122 | 123 | y = torch.arange(0, h, dtype=torch.float) 124 | x = torch.arange(0, w, dtype=torch.float) 125 | y, x = torch.meshgrid(y, x) 126 | 127 | x_mask = masks * x.unsqueeze(0) 128 | x_max = x_mask.flatten(1).max(-1)[0] 129 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 130 | 131 | y_mask = masks * y.unsqueeze(0) 132 | y_max = y_mask.flatten(1).max(-1)[0] 133 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 134 | 135 | return torch.stack([x_min, y_min, x_max, y_max], 1) -------------------------------------------------------------------------------- /maskdino/utils/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import copy 3 | from torch import nn, Tensor 4 | import os 5 | 6 | import math 7 | import torch.nn.functional as F 8 | from torch import nn 9 | 10 | 11 | class MLP(nn.Module): 12 | """ Very simple multi-layer perceptron (also called FFN)""" 13 | 14 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 15 | super().__init__() 16 | self.num_layers = num_layers 17 | h = [hidden_dim] * (num_layers - 1) 18 | self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) 19 | 20 | def forward(self, x): 21 | for i, layer in enumerate(self.layers): 22 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 23 | return x 24 | 25 | 26 | def inverse_sigmoid(x, eps=1e-5): 27 | x = x.clamp(min=0, max=1) 28 | x1 = x.clamp(min=eps) 29 | x2 = (1 - x).clamp(min=eps) 30 | return torch.log(x1/x2) 31 | 32 | 33 | def gen_encoder_output_proposals(memory:Tensor, memory_padding_mask:Tensor, spatial_shapes:Tensor): 34 | """ 35 | Input: 36 | - memory: bs, \sum{hw}, d_model 37 | - 
memory_padding_mask: bs, \sum{hw} 38 | - spatial_shapes: nlevel, 2 39 | Output: 40 | - output_memory: bs, \sum{hw}, d_model 41 | - output_proposals: bs, \sum{hw}, 4 42 | """ 43 | N_, S_, C_ = memory.shape 44 | base_scale = 4.0 45 | proposals = [] 46 | _cur = 0 47 | for lvl, (H_, W_) in enumerate(spatial_shapes): 48 | mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1) 49 | valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1) 50 | valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1) 51 | 52 | grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device), 53 | torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device)) 54 | grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) 55 | 56 | scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2) 57 | grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale 58 | wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl) 59 | proposal = torch.cat((grid, wh), -1).view(N_, -1, 4) 60 | proposals.append(proposal) 61 | _cur += (H_ * W_) 62 | output_proposals = torch.cat(proposals, 1) 63 | output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) 64 | output_proposals = torch.log(output_proposals / (1 - output_proposals)) 65 | output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf')) 66 | output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf')) 67 | 68 | output_memory = memory 69 | output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0)) 70 | output_memory = output_memory.masked_fill(~output_proposals_valid, float(0)) 71 | return output_memory, output_proposals 72 | 73 | 74 | def gen_sineembed_for_position(pos_tensor): 75 | # n_query, bs, _ = pos_tensor.size() 76 | # sineembed_tensor = torch.zeros(n_query, bs, 256) 77 | scale = 2 * math.pi 78 | dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) 79 | dim_t = 10000 ** (2 * (dim_t // 2) / 128) 80 | x_embed = pos_tensor[:, :, 0] * scale 81 | y_embed = pos_tensor[:, :, 1] * scale 82 | pos_x = x_embed[:, :, None] / dim_t 83 | pos_y = y_embed[:, :, None] / dim_t 84 | pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) 85 | pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) 86 | if pos_tensor.size(-1) == 2: 87 | pos = torch.cat((pos_y, pos_x), dim=2) 88 | elif pos_tensor.size(-1) == 4: 89 | w_embed = pos_tensor[:, :, 2] * scale 90 | pos_w = w_embed[:, :, None] / dim_t 91 | pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2) 92 | 93 | h_embed = pos_tensor[:, :, 3] * scale 94 | pos_h = h_embed[:, :, None] / dim_t 95 | pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2) 96 | 97 | pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) 98 | else: 99 | raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1))) 100 | return pos 101 | 102 | 103 | def _get_activation_fn(activation): 104 | """Return an activation function given a string""" 105 | if activation == "relu": 106 | return F.relu 107 | if activation == "gelu": 108 | return F.gelu 109 | if activation == "glu": 110 | return F.glu 111 | if activation == "prelu": 112 | return nn.PReLU() 113 | if activation == "selu": 114 | return F.selu 115 | raise RuntimeError(F"activation should be relu/gelu, not 
{activation}.") 116 | 117 | 118 | def _get_clones(module, N, layer_share=False): 119 | 120 | if layer_share: 121 | return nn.ModuleList([module for i in range(N)]) 122 | else: 123 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | scipy 3 | shapely 4 | timm 5 | h5py 6 | submitit 7 | scikit-image 8 | opencv-python 9 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | We use ResNet and Swin as the backbone in our model. 2 | 3 | * `convert-torchvision-to-d2.py` 4 | 5 | Tool to convert torchvision pre-trained weights for D2. 6 | 7 | ``` 8 | wget https://download.pytorch.org/models/resnet101-63fe2227.pth 9 | python tools/convert-torchvision-to-d2.py resnet101-63fe2227.pth R-101.pkl 10 | ``` 11 | 12 | * `convert-pretrained-swin-model-to-d2.py` 13 | 14 | Tool to convert Swin Transformer pre-trained weights for D2. 15 | 16 | ``` 17 | pip install timm 18 | 19 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 20 | python tools/convert-pretrained-swin-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 21 | 22 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth 23 | python tools/convert-pretrained-swin-model-to-d2.py swin_small_patch4_window7_224.pth swin_small_patch4_window7_224.pkl 24 | 25 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth 26 | python tools/convert-pretrained-swin-model-to-d2.py swin_base_patch4_window12_384_22k.pth swin_base_patch4_window12_384_22k.pkl 27 | 28 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth 29 | python tools/convert-pretrained-swin-model-to-d2.py swin_large_patch4_window12_384_22k.pth swin_large_patch4_window12_384_22k.pkl 30 | ``` 31 | 32 | 33 | * `analyze_model.py` 34 | 35 | Tool to analyze model parameters and flops. 36 | 37 | Usage for semantic segmentation (ADE20K only, use with caution!): 38 | 39 | ``` 40 | python tools/analyze_model.py --num-inputs 1 --tasks flop --use-fixed-input-size --config-file CONFIG_FILE 41 | ``` 42 | 43 | Note that, for semantic segmentation (ADE20K only), we use a dummy image with fixed size that equals to `cfg.INPUT.CROP.SIZE[0] x cfg.INPUT.CROP.SIZE[0]`. 44 | Please do not use `--use-fixed-input-size` for calculating FLOPs on other datasets like Cityscapes! 45 | 46 | Usage for panoptic and instance segmentation: 47 | 48 | ``` 49 | python tools/analyze_model.py --num-inputs 100 --tasks flop --config-file CONFIG_FILE 50 | ``` 51 | 52 | Note that, for panoptic and instance segmentation, we compute the average flops over 100 real validation images. 53 | -------------------------------------------------------------------------------- /tools/analyze_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detectron2/blob/main/tools/analyze_model.py 4 | 5 | import logging 6 | import numpy as np 7 | from collections import Counter 8 | import tqdm 9 | from fvcore.nn import flop_count_table # can also try flop_count_str 10 | 11 | from detectron2.checkpoint import DetectionCheckpointer 12 | from detectron2.config import CfgNode, LazyConfig, get_cfg, instantiate 13 | from detectron2.data import build_detection_test_loader 14 | from detectron2.engine import default_argument_parser 15 | from detectron2.modeling import build_model 16 | from detectron2.projects.deeplab import add_deeplab_config 17 | from detectron2.utils.analysis import ( 18 | FlopCountAnalysis, 19 | activation_count_operators, 20 | parameter_count_table, 21 | ) 22 | from detectron2.utils.logger import setup_logger 23 | 24 | # fmt: off 25 | import os 26 | import sys 27 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 28 | # fmt: on 29 | 30 | from maskdino import add_maskdino_config # config adder from maskdino/config.py (this repo does not ship mask2former) 31 | 32 | logger = logging.getLogger("detectron2") 33 | 34 | 35 | def setup(args): 36 | if args.config_file.endswith(".yaml"): 37 | cfg = get_cfg() 38 | add_deeplab_config(cfg) 39 | add_maskdino_config(cfg) 40 | cfg.merge_from_file(args.config_file) 41 | cfg.DATALOADER.NUM_WORKERS = 0 42 | cfg.merge_from_list(args.opts) 43 | cfg.freeze() 44 | else: 45 | cfg = LazyConfig.load(args.config_file) 46 | cfg = LazyConfig.apply_overrides(cfg, args.opts) 47 | setup_logger(name="fvcore") 48 | setup_logger() 49 | return cfg 50 | 51 | 52 | def do_flop(cfg): 53 | if isinstance(cfg, CfgNode): 54 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) 55 | model = build_model(cfg) 56 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 57 | else: 58 | data_loader = instantiate(cfg.dataloader.test) 59 | model = instantiate(cfg.model) 60 | model.to(cfg.train.device) 61 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint) 62 | model.eval() 63 | 64 | counts = Counter() 65 | total_flops = [] 66 | for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa 67 | if args.use_fixed_input_size and isinstance(cfg, CfgNode): 68 | import torch 69 | crop_size = cfg.INPUT.CROP.SIZE[0] 70 | data[0]["image"] = torch.zeros((3, crop_size, crop_size)) 71 | flops = FlopCountAnalysis(model, data) 72 | if idx > 0: 73 | flops.unsupported_ops_warnings(False).uncalled_modules_warnings(False) 74 | counts += flops.by_operator() 75 | total_flops.append(flops.total()) 76 | 77 | logger.info("Flops table computed from only one input sample:\n" + flop_count_table(flops)) 78 | logger.info( 79 | "Average GFlops for each type of operators:\n" 80 | + str([(k, v / (idx + 1) / 1e9) for k, v in counts.items()]) 81 | ) 82 | logger.info( 83 | "Total GFlops: {:.1f}±{:.1f}".format(np.mean(total_flops) / 1e9, np.std(total_flops) / 1e9) 84 | ) 85 | 86 | 87 | def do_activation(cfg): 88 | if isinstance(cfg, CfgNode): 89 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) 90 | model = build_model(cfg) 91 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 92 | else: 93 | data_loader = instantiate(cfg.dataloader.test) 94 | model = instantiate(cfg.model) 95 | model.to(cfg.train.device) 96 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint) 97 | model.eval() 98 | 99 | counts = Counter() 100 | total_activations = [] 101 | for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa 102 | count = activation_count_operators(model, data) 103 | counts += count
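# per-operator activation counts are accumulated across inputs; the per-image totals collected below are averaged and reported after the loop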
104 | total_activations.append(sum(count.values())) 105 | logger.info( 106 | "(Million) Activations for Each Type of Operators:\n" 107 | + str([(k, v / (idx + 1)) for k, v in counts.items()]) 108 | ) 109 | logger.info( 110 | "Total (Million) Activations: {}±{}".format( 111 | np.mean(total_activations), np.std(total_activations) 112 | ) 113 | ) 114 | 115 | 116 | def do_parameter(cfg): 117 | if isinstance(cfg, CfgNode): 118 | model = build_model(cfg) 119 | else: 120 | model = instantiate(cfg.model) 121 | logger.info("Parameter Count:\n" + parameter_count_table(model, max_depth=5)) 122 | 123 | 124 | def do_structure(cfg): 125 | if isinstance(cfg, CfgNode): 126 | model = build_model(cfg) 127 | else: 128 | model = instantiate(cfg.model) 129 | logger.info("Model Structure:\n" + str(model)) 130 | 131 | 132 | if __name__ == "__main__": 133 | parser = default_argument_parser( 134 | epilog=""" 135 | Examples: 136 | To show parameters of a model: 137 | $ ./analyze_model.py --tasks parameter \\ 138 | --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml 139 | Flops and activations are data-dependent, therefore inputs and model weights 140 | are needed to count them: 141 | $ ./analyze_model.py --num-inputs 100 --tasks flop \\ 142 | --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \\ 143 | MODEL.WEIGHTS /path/to/model.pkl 144 | """ 145 | ) 146 | parser.add_argument( 147 | "--tasks", 148 | choices=["flop", "activation", "parameter", "structure"], 149 | required=True, 150 | nargs="+", 151 | ) 152 | parser.add_argument( 153 | "-n", 154 | "--num-inputs", 155 | default=100, 156 | type=int, 157 | help="number of inputs used to compute statistics for flops/activations, " 158 | "both are data dependent.", 159 | ) 160 | parser.add_argument( 161 | "--use-fixed-input-size", 162 | action="store_true", 163 | help="use fixed input size when calculating flops", 164 | ) 165 | args = parser.parse_args() 166 | assert not args.eval_only 167 | assert args.num_gpus == 1 168 | 169 | cfg = setup(args) 170 | 171 | for task in args.tasks: 172 | { 173 | "flop": do_flop, 174 | "activation": do_activation, 175 | "parameter": do_parameter, 176 | "structure": do_structure, 177 | }[task](cfg) 178 | -------------------------------------------------------------------------------- /tools/convert-pretrained-swin-model-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download pretrained swin model: 12 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 13 | # run the conversion 14 | ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 15 | # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl" 18 | INPUT: 19 | FORMAT: "RGB" 20 | """ 21 | 22 | if __name__ == "__main__": 23 | input = sys.argv[1] 24 | 25 | obj = torch.load(input, map_location="cpu")["model"] 26 | 27 | res = {"model": obj, "__author__": "third_party", "matching_heuristics": True} 28 | 29 | with open(sys.argv[2], "wb") as f: 30 | pkl.dump(res, f) 31 | -------------------------------------------------------------------------------- /tools/convert-torchvision-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download one of the ResNet{18,34,50,101,152} models from torchvision: 12 | wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth 13 | # run the conversion 14 | ./convert-torchvision-to-d2.py r50.pth r50.pkl 15 | # Then, use r50.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/r50.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | RESNETS: 21 | DEPTH: 50 22 | STRIDE_IN_1X1: False 23 | INPUT: 24 | FORMAT: "RGB" 25 | """ 26 | 27 | if __name__ == "__main__": 28 | input = sys.argv[1] 29 | 30 | obj = torch.load(input, map_location="cpu") 31 | 32 | newmodel = {} 33 | for k in list(obj.keys()): 34 | old_k = k 35 | if "layer" not in k: 36 | k = "stem." + k 37 | for t in [1, 2, 3, 4]: 38 | k = k.replace("layer{}".format(t), "res{}".format(t + 1)) 39 | for t in [1, 2, 3]: 40 | k = k.replace("bn{}".format(t), "conv{}.norm".format(t)) 41 | k = k.replace("downsample.0", "shortcut") 42 | k = k.replace("downsample.1", "shortcut.norm") 43 | print(old_k, "->", k) 44 | newmodel[k] = obj.pop(old_k).detach().numpy() 45 | 46 | res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True} 47 | 48 | with open(sys.argv[2], "wb") as f: 49 | pkl.dump(res, f) 50 | if obj: 51 | print("Unconverted keys:", obj.keys()) 52 | -------------------------------------------------------------------------------- /tools/evaluate_coco_boundary_ap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 3 | # Modified by Bowen Cheng from: https://github.com/bowenc0221/boundary-iou-api/blob/master/tools/coco_instance_evaluation.py 4 | 5 | """ 6 | Evaluation for COCO val2017: 7 | python ./tools/evaluate_coco_boundary_ap.py \ 8 | --gt-json-file COCO_GT_JSON \ 9 | --dt-json-file COCO_DT_JSON 10 | """ 11 | import argparse 12 | import json 13 | 14 | from boundary_iou.coco_instance_api.coco import COCO 15 | from boundary_iou.coco_instance_api.cocoeval import COCOeval 16 | 17 | 18 | def main(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--gt-json-file", default="") 21 | parser.add_argument("--dt-json-file", default="") 22 | parser.add_argument("--iou-type", default="boundary") 23 | parser.add_argument("--dilation-ratio", default="0.020", type=float) 24 | args = parser.parse_args() 25 | print(args) 26 | 27 | annFile = args.gt_json_file 28 | resFile = args.dt_json_file 29 | dilation_ratio = args.dilation_ratio 30 | if args.iou_type == "boundary": 31 | get_boundary = True 32 | else: 33 | get_boundary = False 34 | cocoGt = COCO(annFile, get_boundary=get_boundary, dilation_ratio=dilation_ratio) 35 | 36 | # remove box predictions 37 | resFile = json.load(open(resFile)) 38 | for c in resFile: 39 | c.pop("bbox", None) 40 | 41 | cocoDt = cocoGt.loadRes(resFile) 42 | cocoEval = COCOeval(cocoGt, cocoDt, iouType=args.iou_type, dilation_ratio=dilation_ratio) 43 | cocoEval.evaluate() 44 | cocoEval.accumulate() 45 | cocoEval.summarize() 46 | 47 | 48 | if __name__ == '__main__': 49 | main() 50 | -------------------------------------------------------------------------------- /tools/evaluate_pq_for_semantic_segmentation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | import argparse 5 | import json 6 | import os 7 | from collections import defaultdict 8 | from tqdm import tqdm 9 | 10 | import numpy as np 11 | import torch 12 | 13 | from detectron2.data import MetadataCatalog 14 | from detectron2.data.detection_utils import read_image 15 | from detectron2.utils.file_io import PathManager 16 | from pycocotools import mask as maskUtils 17 | 18 | from panopticapi.evaluation import PQStat 19 | 20 | 21 | def default_argument_parser(): 22 | """ 23 | Creates a parser with the arguments used by this evaluation script. 24 | Returns: 25 | argparse.ArgumentParser: 26 | """ 27 | parser = argparse.ArgumentParser(description="Evaluate PQ metric for semantic segmentation.") 28 | # NOTE: Cityscapes is not supported yet; you need to convert 29 | # Cityscapes predictions to the Detectron2 prediction format first. 
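# Example invocation (the JSON path is illustrative -- point --json-file at the semantic-segmentation predictions saved by your evaluation run): python tools/evaluate_pq_for_semantic_segmentation.py --dataset-name ade20k_sem_seg_val --json-file output/inference/sem_seg_predictions.json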
30 | parser.add_argument( 31 | "--dataset-name", 32 | default="ade20k_sem_seg_val", 33 | choices=["ade20k_sem_seg_val", "coco_2017_test_stuff_10k_sem_seg", "ade20k_full_sem_seg_val"], 34 | help="dataset name you want to evaluate") 35 | parser.add_argument("--json-file", default="", help="path to detection json file") 36 | 37 | return parser 38 | 39 | 40 | # Modified from the official panoptic api: https://github.com/cocodataset/panopticapi/blob/master/panopticapi/evaluation.py 41 | def pq_compute_single_image(segm_gt, segm_dt, categories, ignore_label): 42 | pq_stat = PQStat() 43 | VOID = ignore_label 44 | OFFSET = 256 * 256 * 256 45 | 46 | pan_gt = segm_gt 47 | pan_pred = segm_dt 48 | 49 | gt_ann = {'segments_info': []} 50 | labels, labels_cnt = np.unique(segm_gt, return_counts=True) 51 | for cat_id, cnt in zip(labels, labels_cnt): 52 | if cat_id == VOID: 53 | continue 54 | gt_ann['segments_info'].append( 55 | {"id": cat_id, "category_id": cat_id, "area": cnt, "iscrowd": 0} 56 | ) 57 | 58 | pred_ann = {'segments_info': []} 59 | for cat_id in np.unique(segm_dt): 60 | pred_ann['segments_info'].append({"id": cat_id, "category_id": cat_id}) 61 | 62 | gt_segms = {el['id']: el for el in gt_ann['segments_info']} 63 | pred_segms = {el['id']: el for el in pred_ann['segments_info']} 64 | 65 | # predicted segments area calculation + prediction sanity checks 66 | pred_labels_set = set(el['id'] for el in pred_ann['segments_info']) 67 | labels, labels_cnt = np.unique(pan_pred, return_counts=True) 68 | for label, label_cnt in zip(labels, labels_cnt): 69 | if label not in pred_segms: 70 | if label == VOID: 71 | continue 72 | raise KeyError('In the image with ID {} segment with ID {} is presented in PNG and not presented in JSON.'.format(image_id, label)) 73 | pred_segms[label]['area'] = label_cnt 74 | pred_labels_set.remove(label) 75 | if pred_segms[label]['category_id'] not in categories: 76 | raise KeyError('In the image with ID {} segment with ID {} has unknown category_id {}.'.format(image_id, label, pred_segms[label]['category_id'])) 77 | if len(pred_labels_set) != 0: 78 | raise KeyError('In the image with ID {} the following segment IDs {} are presented in JSON and not presented in PNG.'.format(image_id, list(pred_labels_set))) 79 | 80 | # confusion matrix calculation 81 | pan_gt_pred = pan_gt.astype(np.uint64) * OFFSET + pan_pred.astype(np.uint64) 82 | gt_pred_map = {} 83 | labels, labels_cnt = np.unique(pan_gt_pred, return_counts=True) 84 | for label, intersection in zip(labels, labels_cnt): 85 | gt_id = label // OFFSET 86 | pred_id = label % OFFSET 87 | gt_pred_map[(gt_id, pred_id)] = intersection 88 | 89 | # count all matched pairs 90 | gt_matched = set() 91 | pred_matched = set() 92 | for label_tuple, intersection in gt_pred_map.items(): 93 | gt_label, pred_label = label_tuple 94 | if gt_label not in gt_segms: 95 | continue 96 | if pred_label not in pred_segms: 97 | continue 98 | if gt_segms[gt_label]['iscrowd'] == 1: 99 | continue 100 | if gt_segms[gt_label]['category_id'] != pred_segms[pred_label]['category_id']: 101 | continue 102 | 103 | union = pred_segms[pred_label]['area'] + gt_segms[gt_label]['area'] - intersection - gt_pred_map.get((VOID, pred_label), 0) 104 | iou = intersection / union 105 | if iou > 0.5: 106 | pq_stat[gt_segms[gt_label]['category_id']].tp += 1 107 | pq_stat[gt_segms[gt_label]['category_id']].iou += iou 108 | gt_matched.add(gt_label) 109 | pred_matched.add(pred_label) 110 | 111 | # count false positives 112 | crowd_labels_dict = {} 113 | for gt_label, gt_info in 
gt_segms.items(): 114 | if gt_label in gt_matched: 115 | continue 116 | # crowd segments are ignored 117 | if gt_info['iscrowd'] == 1: 118 | crowd_labels_dict[gt_info['category_id']] = gt_label 119 | continue 120 | pq_stat[gt_info['category_id']].fn += 1 121 | 122 | # count false positives 123 | for pred_label, pred_info in pred_segms.items(): 124 | if pred_label in pred_matched: 125 | continue 126 | # intersection of the segment with VOID 127 | intersection = gt_pred_map.get((VOID, pred_label), 0) 128 | # plus intersection with corresponding CROWD region if it exists 129 | if pred_info['category_id'] in crowd_labels_dict: 130 | intersection += gt_pred_map.get((crowd_labels_dict[pred_info['category_id']], pred_label), 0) 131 | # predicted segment is ignored if more than half of the segment correspond to VOID and CROWD regions 132 | if intersection / pred_info['area'] > 0.5: 133 | continue 134 | pq_stat[pred_info['category_id']].fp += 1 135 | 136 | return pq_stat 137 | 138 | 139 | def main(): 140 | parser = default_argument_parser() 141 | args = parser.parse_args() 142 | 143 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 144 | json_file = args.json_file 145 | 146 | with open(json_file) as f: 147 | predictions = json.load(f) 148 | 149 | imgToAnns = defaultdict(list) 150 | for pred in predictions: 151 | image_id = os.path.basename(pred["file_name"]).split(".")[0] 152 | imgToAnns[image_id].append( 153 | {"category_id" : pred["category_id"], "segmentation" : pred["segmentation"]} 154 | ) 155 | 156 | image_ids = list(imgToAnns.keys()) 157 | 158 | meta = MetadataCatalog.get(args.dataset_name) 159 | class_names = meta.stuff_classes 160 | num_classes = len(meta.stuff_classes) 161 | ignore_label = meta.ignore_label 162 | conf_matrix = np.zeros((num_classes + 1, num_classes + 1), dtype=np.int64) 163 | 164 | categories = {} 165 | for i in range(num_classes): 166 | categories[i] = {"id": i, "name": class_names[i], "isthing": 0} 167 | 168 | pq_stat = PQStat() 169 | 170 | for image_id in tqdm(image_ids): 171 | if args.dataset_name == "ade20k_sem_seg_val": 172 | gt_dir = os.path.join(_root, "ADEChallengeData2016", "annotations_detectron2", "validation") 173 | segm_gt = read_image(os.path.join(gt_dir, image_id + ".png")).copy().astype(np.int64) 174 | elif args.dataset_name == "coco_2017_test_stuff_10k_sem_seg": 175 | gt_dir = os.path.join(_root, "coco", "coco_stuff_10k", "annotations_detectron2", "test") 176 | segm_gt = read_image(os.path.join(gt_dir, image_id + ".png")).copy().astype(np.int64) 177 | elif args.dataset_name == "ade20k_full_sem_seg_val": 178 | gt_dir = os.path.join(_root, "ADE20K_2021_17_01", "annotations_detectron2", "validation") 179 | segm_gt = read_image(os.path.join(gt_dir, image_id + ".tif")).copy().astype(np.int64) 180 | else: 181 | raise ValueError(f"Unsupported dataset {args.dataset_name}") 182 | 183 | # get predictions 184 | segm_dt = np.zeros_like(segm_gt) 185 | anns = imgToAnns[image_id] 186 | for ann in anns: 187 | # map back category_id 188 | if hasattr(meta, "stuff_dataset_id_to_contiguous_id"): 189 | if ann["category_id"] in meta.stuff_dataset_id_to_contiguous_id: 190 | category_id = meta.stuff_dataset_id_to_contiguous_id[ann["category_id"]] 191 | else: 192 | category_id = ann["category_id"] 193 | mask = maskUtils.decode(ann["segmentation"]) 194 | segm_dt[mask > 0] = category_id 195 | 196 | # miou 197 | gt = segm_gt.copy() 198 | pred = segm_dt.copy() 199 | gt[gt == ignore_label] = num_classes 200 | conf_matrix += np.bincount( 201 | (num_classes + 1) * pred.reshape(-1) 
+ gt.reshape(-1), 202 | minlength=conf_matrix.size, 203 | ).reshape(conf_matrix.shape) 204 | 205 | # pq 206 | pq_stat_single = pq_compute_single_image(segm_gt, segm_dt, categories, meta.ignore_label) 207 | pq_stat += pq_stat_single 208 | 209 | metrics = [("All", None), ("Stuff", False)] 210 | results = {} 211 | for name, isthing in metrics: 212 | results[name], per_class_results = pq_stat.pq_average(categories, isthing=isthing) 213 | if name == 'All': 214 | results['per_class'] = per_class_results 215 | print("{:10s}| {:>5s} {:>5s} {:>5s} {:>5s}".format("", "PQ", "SQ", "RQ", "N")) 216 | print("-" * (10 + 7 * 4)) 217 | 218 | for name, _isthing in metrics: 219 | print("{:10s}| {:5.1f} {:5.1f} {:5.1f} {:5d}".format( 220 | name, 221 | 100 * results[name]['pq'], 222 | 100 * results[name]['sq'], 223 | 100 * results[name]['rq'], 224 | results[name]['n']) 225 | ) 226 | 227 | # calculate miou 228 | acc = np.full(num_classes, np.nan, dtype=np.float64) 229 | iou = np.full(num_classes, np.nan, dtype=np.float64) 230 | tp = conf_matrix.diagonal()[:-1].astype(np.float64) 231 | pos_gt = np.sum(conf_matrix[:-1, :-1], axis=0).astype(np.float64) 232 | pos_pred = np.sum(conf_matrix[:-1, :-1], axis=1).astype(np.float64) 233 | acc_valid = pos_gt > 0 234 | acc[acc_valid] = tp[acc_valid] / pos_gt[acc_valid] 235 | iou_valid = (pos_gt + pos_pred) > 0 236 | union = pos_gt + pos_pred - tp 237 | iou[acc_valid] = tp[acc_valid] / union[acc_valid] 238 | miou = np.sum(iou[acc_valid]) / np.sum(iou_valid) 239 | 240 | print("") 241 | print(f"mIoU: {miou}") 242 | 243 | 244 | if __name__ == '__main__': 245 | main() 246 | --------------------------------------------------------------------------------