├── .gitignore ├── INSTALL.md ├── LICENSE ├── LICENSE_MASK2FORMER ├── MODELS.md ├── README.md ├── checkpoints └── README.md ├── configs ├── ade20k │ ├── instance-segmentation │ │ ├── Base-ADE20K-InstanceSegmentation.yaml │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ └── swin │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_160k.yaml │ ├── panoptic-segmentation │ │ ├── Base-ADE20K-PanopticSegmentation.yaml │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ └── swin │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_160k.yaml │ └── semantic-segmentation │ │ ├── Base-ADE20K-SemanticSegmentation.yaml │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ └── swin │ │ ├── maskformer2_swin_base_384_bs16_160k_res640.yaml │ │ ├── maskformer2_swin_base_IN21k_384_bs16_160k_res640.yaml │ │ ├── maskformer2_swin_large_IN21k_384_bs16_160k_res640.yaml │ │ ├── maskformer2_swin_small_bs16_160k.yaml │ │ └── maskformer2_swin_tiny_bs16_160k.yaml ├── cityscapes │ ├── instance-segmentation │ │ ├── Base-Cityscapes-InstanceSegmentation.yaml │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ └── swin │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ └── maskformer2_swin_tiny_bs16_90k.yaml │ ├── panoptic-segmentation │ │ ├── Base-Cityscapes-PanopticSegmentation.yaml │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ └── swin │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ └── maskformer2_swin_tiny_bs16_90k.yaml │ ├── pps │ │ ├── Base-Cityscapes-PPS.yaml │ │ ├── tapps_cityscapes_r50_cocoinit.yaml │ │ ├── tapps_cityscapes_r50_in1kinit.yaml │ │ └── tapps_cityscapes_swinb_cocoinit.yaml │ └── semantic-segmentation │ │ ├── Base-Cityscapes-SemanticSegmentation.yaml │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ └── swin │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ └── maskformer2_swin_tiny_bs16_90k.yaml ├── coco │ ├── instance-segmentation │ │ ├── Base-COCO-InstanceSegmentation.yaml │ │ ├── maskformer2_R101_bs16_50ep.yaml │ │ ├── maskformer2_R50_bs16_50ep.yaml │ │ └── swin │ │ │ ├── maskformer2_swin_base_384_bs16_50ep.yaml │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_50ep.yaml │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_100ep.yaml │ │ │ ├── maskformer2_swin_small_bs16_50ep.yaml │ │ │ └── maskformer2_swin_tiny_bs16_50ep.yaml │ └── panoptic-segmentation │ │ ├── Base-COCO-PanopticSegmentation.yaml │ │ ├── maskformer2_R101_bs16_50ep.yaml │ │ ├── maskformer2_R50_bs16_50ep.yaml │ │ └── swin │ │ ├── maskformer2_swin_base_384_bs16_50ep.yaml │ │ ├── maskformer2_swin_base_IN21k_384_bs16_50ep.yaml │ │ ├── maskformer2_swin_large_IN21k_384_bs16_100ep.yaml │ │ ├── maskformer2_swin_small_bs16_50ep.yaml │ │ └── maskformer2_swin_tiny_bs16_50ep.yaml ├── mapillary-vistas │ ├── panoptic-segmentation │ │ ├── Base-MapillaryVistas-PanopticSegmentation.yaml │ │ ├── maskformer_R50_bs16_300k.yaml │ │ └── swin │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_300k.yaml │ └── semantic-segmentation │ │ ├── Base-MapillaryVistas-SemanticSegmentation.yaml │ │ ├── maskformer2_R50_bs16_300k.yaml │ │ └── swin │ │ └── maskformer2_swin_large_IN21k_384_bs16_300k.yaml └── pascal │ └── pps │ ├── 
Base-Pascal-PPS-LSJ.yaml │ ├── pascal_107 │ ├── tapps_pascal107_r50_cocoinit.yaml │ └── tapps_pascal107_r50_in1kinit.yaml │ ├── tapps_pascal_r50_cocoinit.yaml │ ├── tapps_pascal_r50_in1kinit.yaml │ └── tapps_pascal_swinb_cocoinit.yaml ├── datasets ├── README.md ├── ade20k_instance_catid_mapping.txt ├── ade20k_instance_imgCatIds.json ├── prepare_ade20k_ins_seg.py ├── prepare_ade20k_pan_seg.py ├── prepare_ade20k_sem_seg.py ├── prepare_cityscapes_pp.py ├── prepare_coco_semantic_annos_from_panoptic_annos.py ├── prepare_pascal_pp.py └── prepare_pascal_pp_107.py ├── eval ├── eval_partpq.py └── visualize_pps.py ├── inference_single_img.py ├── requirements.txt ├── tapps ├── __init__.py ├── config.py ├── data │ ├── __init__.py │ ├── dataset_mappers │ │ ├── __init__.py │ │ ├── augmentations.py │ │ ├── coco_instance_new_baseline_dataset_mapper.py │ │ ├── coco_panoptic_new_baseline_dataset_mapper.py │ │ ├── mask_former_instance_dataset_mapper.py │ │ ├── mask_former_panoptic_dataset_mapper.py │ │ ├── mask_former_panoptic_parts_dataset_mapper.py │ │ ├── mask_former_semantic_dataset_mapper.py │ │ └── pascal_panoptic_parts_new_baseline_dataset_mapper.py │ └── datasets │ │ ├── __init__.py │ │ ├── register_ade20k_full.py │ │ ├── register_ade20k_instance.py │ │ ├── register_ade20k_panoptic.py │ │ ├── register_cityscapes_panoptic_parts.py │ │ ├── register_coco_panoptic_annos_semseg.py │ │ ├── register_coco_stuff_10k.py │ │ ├── register_mapillary_vistas.py │ │ ├── register_mapillary_vistas_panoptic.py │ │ ├── register_pascal_panoptic_parts.py │ │ └── register_pascal_panoptic_parts_107.py ├── evaluation │ ├── __init__.py │ └── instance_evaluation.py ├── maskformer_model.py ├── modeling │ ├── __init__.py │ ├── backbone │ │ ├── __init__.py │ │ └── swin.py │ ├── criterion.py │ ├── matcher.py │ ├── meta_arch │ │ ├── __init__.py │ │ ├── mask_former_head.py │ │ └── per_pixel_baseline.py │ ├── pixel_decoder │ │ ├── __init__.py │ │ ├── fpn.py │ │ ├── msdeformattn.py │ │ └── ops │ │ │ ├── functions │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn_func.py │ │ │ ├── make.sh │ │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn.py │ │ │ ├── setup.py │ │ │ ├── src │ │ │ ├── cpu │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ └── ms_deform_attn_cpu.h │ │ │ ├── cuda │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ ├── ms_deform_attn.h │ │ │ └── vision.cpp │ │ │ └── test.py │ └── transformer_decoder │ │ ├── __init__.py │ │ ├── mask2former_transformer_decoder.py │ │ ├── maskformer_transformer_decoder.py │ │ ├── part_decoder.py │ │ ├── position_encoding.py │ │ └── transformer.py ├── test_time_augmentation.py └── utils │ ├── __init__.py │ └── misc.py ├── tools ├── README.md ├── analyze_model.py ├── convert-pretrained-swin-model-to-d2.py ├── convert-torchvision-to-d2.py ├── evaluate_coco_boundary_ap.py └── evaluate_pq_for_semantic_segmentation.py ├── train_net.py └── utils └── panoptic_parts ├── LICENSE ├── MANIFEST.in ├── README.md ├── __init__.py ├── docs ├── Makefile ├── make.bat ├── requirements.txt └── source │ ├── api_and_code.rst │ ├── conf.py │ ├── contact.md │ ├── errata_cvpr2021.md │ ├── evaluate_results.md │ ├── generate_results.md │ ├── ground_truth_usage_cases.md │ ├── index.rst │ ├── installation.md │ ├── introduction.md │ ├── label_format.md │ ├── scripts.md │ ├── tools.md │ └── visualization.md ├── optional.txt ├── panoptic_parts ├── __init__.py ├── cityscapes_panoptic_parts │ ├── __init__.py │ └── dataset_v2.0 │ │ └── README.md ├── evaluation │ 
├── __init__.py │ ├── eval_PartPQ.py │ ├── experimental_eval_PartIOU.py │ └── prepare_data.py ├── merging │ ├── __init__.py │ ├── merge_to_panoptic.py │ └── merge_to_pps.py ├── pascal_panoptic_parts │ ├── __init__.py │ └── dataset_v2.0 │ │ └── README.md ├── specs │ ├── __init__.py │ ├── dataset_spec.py │ ├── dataset_specs │ │ ├── cpp_datasetspec.yaml │ │ └── ppp_datasetspec.yaml │ ├── eval_spec.py │ └── eval_specs │ │ ├── ppq_cpp_19_23_cvpr21_default_evalspec.yaml │ │ ├── ppq_cpp_19_23_cvpr21_grouped_evalspec.yaml │ │ ├── ppq_ppp_59_107_cvpr21_default_evalspec.yaml │ │ └── ppq_ppp_59_57_cvpr21_default_evalspec.yaml ├── utils │ ├── __init__.py │ ├── evaluation_PartPQ.py │ ├── experimental_evaluation_IOU.py │ ├── format.py │ ├── internal │ │ ├── __init__.py │ │ ├── convert_annotations_v1_to_v2.py │ │ ├── populate_ppp_official_evalspec.py │ │ └── ppq_ppp_20_58_part_groupings.yaml │ ├── utils.py │ └── visualization.py └── visualization │ ├── __init__.py │ └── visualize_label_with_legend.py ├── pyproject.toml ├── requirements.txt ├── setup.cfg └── tests ├── __init__.py ├── cityscapes_panoptic_parts ├── __init__.py ├── dataset_sanity_check.py ├── visualize_from_paths_test.sh └── visualize_label_with_legend_test.sh ├── pascal_panoptic_parts └── visualize_from_paths_test.sh └── utils ├── __init__.py ├── format_test.py ├── utils_test.py └── visualization_test.py /.gitignore: -------------------------------------------------------------------------------- 1 | # output dir 2 | output 3 | predictions 4 | instant_test_output 5 | inference_test_output 6 | 7 | 8 | *.png 9 | *.json 10 | *.diff 11 | *.jpg 12 | !/projects/DensePose/doc/images/*.jpg 13 | 14 | # compilation and distribution 15 | __pycache__ 16 | _ext 17 | *.pyc 18 | *.pyd 19 | *.so 20 | *.dll 21 | *.egg-info/ 22 | build/ 23 | dist/ 24 | wheels/ 25 | 26 | # pytorch/python/numpy formats 27 | *.pth 28 | *.pkl 29 | *.npy 30 | *.ts 31 | model_ts*.txt 32 | 33 | # ipython/jupyter notebooks 34 | *.ipynb 35 | **/.ipynb_checkpoints/ 36 | 37 | # Editor temporaries 38 | *.swn 39 | *.swo 40 | *.swp 41 | *~ 42 | 43 | # editor settings 44 | .idea 45 | .vscode 46 | _darcs 47 | 48 | # project dirs 49 | /checkpoints/* 50 | /detectron2/model_zoo/configs 51 | /datasets/* 52 | !/datasets/*.* 53 | /projects/*/datasets 54 | /models 55 | /snippet -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | These installation instructions have been slightly adapted from the original [Mask2Former instructions](https://github.com/facebookresearch/Mask2Former/blob/main/INSTALL.md). 4 | 5 | ### Requirements 6 | - Linux or macOS with Python ≥ 3.6 7 | - PyTorch ≥ 1.9 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation. 8 | Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note: please check that the 9 | PyTorch version matches the one required by Detectron2. 10 | - Detectron2: follow the [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html). 11 | - OpenCV is optional but needed for the demo and visualization 12 | - `pip install -r requirements.txt` 13 | 14 | ### CUDA kernel for MSDeformAttn 15 | After preparing the required environment, run the following command to compile the CUDA kernel for MSDeformAttn: 16 | 17 | `CUDA_HOME` must be defined and point to the directory of the installed CUDA toolkit. 
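If `CUDA_HOME` is not already set in your shell, export it before running the compile command below; the path here is only an example and depends on where the CUDA toolkit is installed on your system:

```bash
# Example only: adjust the path to your local CUDA toolkit installation.
export CUDA_HOME=/usr/local/cuda
# Optional sanity check: nvcc should be found inside $CUDA_HOME.
$CUDA_HOME/bin/nvcc --version
```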
18 | 19 | ```bash 20 | cd tapps/modeling/pixel_decoder/ops 21 | sh make.sh 22 | ``` 23 | 24 | #### Building on another system 25 | To build on a system that does not have a GPU device but does provide the drivers: 26 | ```bash 27 | TORCH_CUDA_ARCH_LIST='8.0' FORCE_CUDA=1 python setup.py build install 28 | ``` 29 | 30 | ### Example conda environment setup 31 | ```bash 32 | conda create --name tapps python=3.9 -y 33 | conda activate tapps 34 | conda install pytorch==1.13.1 torchvision==0.14.1 pytorch-cuda=11.7 -c pytorch -c nvidia 35 | conda install ruamel.yaml pandas scipy shapely h5py 36 | pip install -U opencv-python 37 | 38 | # under your working directory 39 | git clone git@github.com:facebookresearch/detectron2.git 40 | cd detectron2 41 | python -m pip install -e . 42 | pip install git+https://github.com/cocodataset/panopticapi.git 43 | pip install git+https://github.com/mcordts/cityscapesScripts.git 44 | 45 | cd .. 46 | git clone https://github.com/tue-mps/tapps.git 47 | cd tapps 48 | pip install timm submitit cython scikit-image psutil scikit-learn 49 | cd tapps/modeling/pixel_decoder/ops 50 | sh make.sh 51 | ``` 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024 Daan de Geus, Eindhoven University of Technology 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /LICENSE_MASK2FORMER: -------------------------------------------------------------------------------- 1 | Copyright (c) 2022 Meta, Inc. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /checkpoints/README.md: -------------------------------------------------------------------------------- 1 | # Download pre-trained model weights 2 | 3 | ## TAPPS model weights 4 | [Here](../MODELS.md), we list the different models that we release, and provide a download link. 5 | 6 | 7 | ## COCO pre-trained model weights 8 | To initialize a model with COCO (panoptic) pre-trained weights, as done in our work, follow these steps: 9 | 10 | 1. Identify the backbone architecture of the model you wish to train (e.g., ResNet-50 or Swin-B) 11 | 2. For this backbone, download the model weights provided in the [original Mask2Former repository](https://github.com/facebookresearch/Mask2Former/blob/main/MODEL_ZOO.md#panoptic-segmentation), trained for COCO panoptic segmentation. 12 | 3. Place these model weights in the `checkpoints` directory, following this structure: 13 | 14 | ``` 15 | checkpoints/ 16 | maskformer2_R50_bs16_50ep/ 17 | model_final_94dc52.pkl 18 | maskformer2_swin_base_IN21k_384_bs16_50ep/ 19 | model_final_54b88a.pkl 20 | ``` 21 | 22 | Then, you can simply run the training code following [the instructions provided here](../README.md#training). In the default configs, the path to the COCO pre-trained weights is already provided. -------------------------------------------------------------------------------- /configs/ade20k/instance-segmentation/Base-ADE20K-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_instance_train",) 18 | TEST: ("ade20k_instance_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 640 40 | MAX_SIZE_TRAIN: 2560 41 | MAX_SIZE_TEST: 2560 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (640, 640) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_instance" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 56 | MAX_SIZE: 4480 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | 
-------------------------------------------------------------------------------- /configs/ade20k/instance-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 100 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/ade20k/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /configs/ade20k/panoptic-segmentation/Base-ADE20K-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_panoptic_train",) 18 | TEST: ("ade20k_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 
38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 640 40 | MAX_SIZE_TRAIN: 2560 41 | MAX_SIZE_TEST: 2560 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (640, 640) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 56 | MAX_SIZE: 4480 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /configs/ade20k/panoptic-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/ade20k/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/Base-ADE20K-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | 
RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_sem_seg_train",) 18 | TEST: ("ade20k_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 512 40 | MAX_SIZE_TRAIN: 2048 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 512) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 512 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [256, 384, 512, 640, 768, 896] 56 | MAX_SIZE: 3584 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 
1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | 
FLIP: True 38 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/swin/maskformer2_swin_small_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/Base-Cityscapes-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_instance_seg_train",) 18 | TEST: ("cityscapes_fine_instance_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_instance" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: 
"R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 8 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: False 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: 
"D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/Base-Cityscapes-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_panoptic_train",) 18 | TEST: ("cityscapes_fine_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- 
/configs/cityscapes/panoptic-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | 
-------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/cityscapes/pps/Base-Cityscapes-PPS.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_panoptic_parts_train",) 18 | TEST: ("cityscapes_panoptic_parts_val",) 19 | NAME: "Cityscapes" 20 | SOLVER: 21 | IMS_PER_BATCH: 16 22 | BASE_LR: 0.0001 23 | MAX_ITER: 90000 24 | CHECKPOINT_PERIOD: 20000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 0 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | LR_SCHEDULER_NAME: "WarmupPolyLR" 30 | BACKBONE_MULTIPLIER: 0.1 31 | CLIP_GRADIENTS: 32 | ENABLED: True 33 | CLIP_TYPE: "full_model" 34 | CLIP_VALUE: 0.01 35 | NORM_TYPE: 2.0 36 | AMP: 37 | ENABLED: True 38 | INPUT: 39 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 40 | MIN_SIZE_TRAIN_SAMPLING: "choice" 41 | MIN_SIZE_TEST: 1024 42 | MAX_SIZE_TRAIN: 4096 43 | MAX_SIZE_TEST: 2048 44 | CROP: 45 | ENABLED: True 46 | TYPE: "absolute" 47 | SIZE: (512, 1024) 48 | SINGLE_CATEGORY_MAX_AREA: 1.0 49 | COLOR_AUG_SSD: True 50 | SIZE_DIVISIBILITY: -1 51 | FORMAT: "RGB" 52 | DATASET_MAPPER_NAME: "mask_former_panoptic_parts" 53 | TEST: 54 | EVAL_PERIOD: 20000 55 | AUG: 56 | ENABLED: False 57 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 58 | MAX_SIZE: 4096 59 | FLIP: True 60 | DATALOADER: 61 | FILTER_EMPTY_ANNOTATIONS: True 62 | NUM_WORKERS: 4 63 | VERSION: 2 64 | PPS_EVAL_SPEC: "utils/panoptic_parts/panoptic_parts/specs/eval_specs/ppq_cpp_19_23_cvpr21_grouped_evalspec.yaml" -------------------------------------------------------------------------------- /configs/cityscapes/pps/tapps_cityscapes_r50_cocoinit.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-PPS.yaml 2 | OUTPUT_DIR: "output/tapps_cityscapes_r50_cocoinit/" 3 | MODEL: 4 | WEIGHTS: "checkpoints/maskformer2_R50_bs16_50ep/model_final_94dc52.pkl" 5 | META_ARCHITECTURE: "MaskFormer" 6 | SEM_SEG_HEAD: 7 | NAME: "MaskFormerHead" 8 | IGNORE_VALUE: 255 9 | NUM_CLASSES: 19 10 | LOSS_WEIGHT: 1.0 11 | CONVS_DIM: 256 12 | MASK_DIM: 256 13 | NORM: "GN" 14 | # pixel decoder 15 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 16 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 17 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 18 | COMMON_STRIDE: 4 19 | 
TRANSFORMER_ENC_LAYERS: 6 20 | RESNETS: 21 | NORM: "SyncBN" 22 | MASK_FORMER: 23 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 24 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 25 | DEEP_SUPERVISION: True 26 | NO_OBJECT_WEIGHT: 0.1 27 | CLASS_WEIGHT: 2.0 28 | MASK_WEIGHT: 5.0 29 | DICE_WEIGHT: 5.0 30 | HIDDEN_DIM: 256 31 | NUM_OBJECT_QUERIES: 100 32 | NHEADS: 8 33 | DROPOUT: 0.0 34 | DIM_FEEDFORWARD: 2048 35 | ENC_LAYERS: 0 36 | PRE_NORM: False 37 | ENFORCE_INPUT_PROJ: False 38 | SIZE_DIVISIBILITY: 32 39 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 40 | TRAIN_NUM_POINTS: 12544 41 | OVERSAMPLE_RATIO: 3.0 42 | IMPORTANCE_SAMPLE_RATIO: 0.75 43 | PARTS_ON: True 44 | NUM_PART_CLASSES: 9 45 | LOSS_WEIGHT_PARTS: 1.0 46 | LOSS_WEIGHT_PANOPTIC: 1.0 47 | PARTS_CONF_THRESHOLD: 0.1 48 | TEST: 49 | SEMANTIC_ON: False 50 | INSTANCE_ON: False 51 | PANOPTIC_ON: True 52 | PARTS_ON: True 53 | OVERLAP_THRESHOLD: 0.8 54 | OBJECT_MASK_THRESHOLD: 0.8 55 | -------------------------------------------------------------------------------- /configs/cityscapes/pps/tapps_cityscapes_r50_in1kinit.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-PPS.yaml 2 | OUTPUT_DIR: "output/tapps_cityscapes_r50_in1kinit/" 3 | MODEL: 4 | META_ARCHITECTURE: "MaskFormer" 5 | SEM_SEG_HEAD: 6 | NAME: "MaskFormerHead" 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 19 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | RESNETS: 20 | NORM: "SyncBN" 21 | MASK_FORMER: 22 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 23 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 24 | DEEP_SUPERVISION: True 25 | NO_OBJECT_WEIGHT: 0.1 26 | CLASS_WEIGHT: 2.0 27 | MASK_WEIGHT: 5.0 28 | DICE_WEIGHT: 5.0 29 | HIDDEN_DIM: 256 30 | NUM_OBJECT_QUERIES: 100 31 | NHEADS: 8 32 | DROPOUT: 0.0 33 | DIM_FEEDFORWARD: 2048 34 | ENC_LAYERS: 0 35 | PRE_NORM: False 36 | ENFORCE_INPUT_PROJ: False 37 | SIZE_DIVISIBILITY: 32 38 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 39 | TRAIN_NUM_POINTS: 12544 40 | OVERSAMPLE_RATIO: 3.0 41 | IMPORTANCE_SAMPLE_RATIO: 0.75 42 | PARTS_ON: True 43 | NUM_PART_CLASSES: 9 44 | LOSS_WEIGHT_PARTS: 1.0 45 | LOSS_WEIGHT_PANOPTIC: 1.0 46 | PARTS_CONF_THRESHOLD: 0.1 47 | TEST: 48 | SEMANTIC_ON: False 49 | INSTANCE_ON: False 50 | PANOPTIC_ON: True 51 | PARTS_ON: True 52 | OVERLAP_THRESHOLD: 0.8 53 | OBJECT_MASK_THRESHOLD: 0.8 -------------------------------------------------------------------------------- /configs/cityscapes/pps/tapps_cityscapes_swinb_cocoinit.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-PPS.yaml 2 | OUTPUT_DIR: "output/tapps_cityscapes_swinb_cocoinit/" 3 | MODEL: 4 | WEIGHTS: "checkpoints/maskformer2_swin_base_IN21k_384_bs16_50ep/model_final_54b88a.pkl" 5 | META_ARCHITECTURE: "MaskFormer" 6 | BACKBONE: 7 | NAME: "D2SwinTransformer" 8 | SWIN: 9 | EMBED_DIM: 128 10 | DEPTHS: [ 2, 2, 18, 2 ] 11 | NUM_HEADS: [ 4, 8, 16, 32 ] 12 | WINDOW_SIZE: 12 13 | APE: False 14 | DROP_PATH_RATE: 0.3 15 | PATCH_NORM: True 16 | PRETRAIN_IMG_SIZE: 384 17 | PIXEL_MEAN: [ 123.675, 116.280, 103.530 ] 18 | PIXEL_STD: [ 58.395, 57.120, 57.375 ] 19 
| SEM_SEG_HEAD: 20 | NAME: "MaskFormerHead" 21 | IGNORE_VALUE: 255 22 | NUM_CLASSES: 19 23 | LOSS_WEIGHT: 1.0 24 | CONVS_DIM: 256 25 | MASK_DIM: 256 26 | NORM: "GN" 27 | # pixel decoder 28 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 29 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 30 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 31 | COMMON_STRIDE: 4 32 | TRANSFORMER_ENC_LAYERS: 6 33 | MASK_FORMER: 34 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 35 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 36 | DEEP_SUPERVISION: True 37 | NO_OBJECT_WEIGHT: 0.1 38 | CLASS_WEIGHT: 2.0 39 | MASK_WEIGHT: 5.0 40 | DICE_WEIGHT: 5.0 41 | HIDDEN_DIM: 256 42 | NUM_OBJECT_QUERIES: 100 43 | NHEADS: 8 44 | DROPOUT: 0.0 45 | DIM_FEEDFORWARD: 2048 46 | ENC_LAYERS: 0 47 | PRE_NORM: False 48 | ENFORCE_INPUT_PROJ: False 49 | SIZE_DIVISIBILITY: 32 50 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 51 | TRAIN_NUM_POINTS: 12544 52 | OVERSAMPLE_RATIO: 3.0 53 | IMPORTANCE_SAMPLE_RATIO: 0.75 54 | PARTS_ON: True 55 | NUM_PART_CLASSES: 9 56 | LOSS_WEIGHT_PARTS: 1.0 57 | LOSS_WEIGHT_PANOPTIC: 1.0 58 | PARTS_CONF_THRESHOLD: 0.1 59 | TEST: 60 | SEMANTIC_ON: False 61 | INSTANCE_ON: False 62 | PANOPTIC_ON: True 63 | PARTS_ON: True 64 | OVERLAP_THRESHOLD: 0.8 65 | OBJECT_MASK_THRESHOLD: 0.8 66 | 67 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/Base-Cityscapes-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_sem_seg_train",) 18 | TEST: ("cityscapes_fine_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: 
maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: 
../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/Base-COCO-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train",) 18 | TEST: ("coco_2017_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_instance_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/maskformer2_R101_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 80 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: 
"MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: False 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | SOLVER: 20 | STEPS: (655556, 710184) 21 | MAX_ITER: 737500 22 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | 
DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/swin/maskformer2_swin_tiny_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/Base-COCO-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train_panoptic",) 18 | TEST: ("coco_2017_val_panoptic",) # to evaluate instance and semantic performance as well 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_panoptic_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/maskformer2_R101_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 133 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: 
"MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: True 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: True 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | SOLVER: 20 | STEPS: (655556, 710184) 21 | MAX_ITER: 737500 22 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | 
DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/swin/maskformer2_swin_tiny_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/mapillary-vistas/panoptic-segmentation/Base-MapillaryVistas-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_vistas_panoptic_train",) 18 | TEST: ("mapillary_vistas_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 300000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 2048 40 | MAX_SIZE_TRAIN: 8192 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (1024, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 1024 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 0 53 | DATALOADER: 54 | FILTER_EMPTY_ANNOTATIONS: True 55 | NUM_WORKERS: 10 56 | VERSION: 2 57 | -------------------------------------------------------------------------------- /configs/mapillary-vistas/panoptic-segmentation/maskformer_R50_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-MapillaryVistas-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 65 7 | NUM_CLASSES: 65 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: 
"MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.0 45 | -------------------------------------------------------------------------------- /configs/mapillary-vistas/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /configs/mapillary-vistas/semantic-segmentation/Base-MapillaryVistas-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_vistas_sem_seg_train",) 18 | TEST: ("mapillary_vistas_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 300000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 2048 40 | MAX_SIZE_TRAIN: 8192 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (1024, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 1024 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 0 53 | DATALOADER: 54 | FILTER_EMPTY_ANNOTATIONS: True 55 | NUM_WORKERS: 10 56 | VERSION: 2 57 | -------------------------------------------------------------------------------- /configs/mapillary-vistas/semantic-segmentation/maskformer2_R50_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | 
_BASE_: Base-MapillaryVistas-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 65 7 | NUM_CLASSES: 65 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.0 45 | -------------------------------------------------------------------------------- /configs/mapillary-vistas/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | -------------------------------------------------------------------------------- /configs/pascal/pps/Base-Pascal-PPS-LSJ.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("pascal_panoptic_parts_train",) 18 | TEST: ("pascal_panoptic_parts_val",) 19 | NAME: "Pascal" 20 | SOLVER: 21 | IMS_PER_BATCH: 16 22 | BASE_LR: 0.0001 23 | MAX_ITER: 60000 24 | CHECKPOINT_PERIOD: 10000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 100 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | LR_SCHEDULER_NAME: "WarmupPolyLR" 30 | BACKBONE_MULTIPLIER: 0.1 31 | CLIP_GRADIENTS: 32 | ENABLED: True 33 | CLIP_TYPE: "full_model" 34 | CLIP_VALUE: 0.01 35 | NORM_TYPE: 2.0 36 | AMP: 37 | ENABLED: True 38 | INPUT: 39 | IMAGE_SIZE: 1024 40 | MIN_SCALE: 0.1 41 | MAX_SCALE: 2.0 42 | FORMAT: "RGB" 43 | DATASET_MAPPER_NAME: "pascal_panoptic_parts_lsj" 44 | TEST: 45 | EVAL_PERIOD: 20000 46 | DATALOADER: 47 | FILTER_EMPTY_ANNOTATIONS: True 48 | NUM_WORKERS: 4 
49 | VERSION: 2 50 | PPS_EVAL_SPEC: "utils/panoptic_parts/panoptic_parts/specs/eval_specs/ppq_ppp_59_57_cvpr21_default_evalspec.yaml" -------------------------------------------------------------------------------- /configs/pascal/pps/pascal_107/tapps_pascal107_r50_cocoinit.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../Base-Pascal-PPS-LSJ.yaml 2 | OUTPUT_DIR: "output/tapps_pascal107_r50_cocoinit/" 3 | DATASETS: 4 | TRAIN: ("pascal_panoptic_parts_107_train",) 5 | TEST: ("pascal_panoptic_parts_107_val",) 6 | NAME: "Pascal107" 7 | SOLVER: 8 | MAX_ITER: 10000 9 | MODEL: 10 | WEIGHTS: "checkpoints/maskformer2_R50_bs16_50ep/model_final_94dc52.pkl" 11 | META_ARCHITECTURE: "MaskFormer" 12 | SEM_SEG_HEAD: 13 | NAME: "MaskFormerHead" 14 | IGNORE_VALUE: 255 15 | NUM_CLASSES: 59 16 | LOSS_WEIGHT: 1.0 17 | CONVS_DIM: 256 18 | MASK_DIM: 256 19 | NORM: "GN" 20 | # pixel decoder 21 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 22 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 23 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 24 | COMMON_STRIDE: 4 25 | TRANSFORMER_ENC_LAYERS: 6 26 | MASK_FORMER: 27 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 28 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 29 | DEEP_SUPERVISION: True 30 | NO_OBJECT_WEIGHT: 0.1 31 | CLASS_WEIGHT: 2.0 32 | MASK_WEIGHT: 5.0 33 | DICE_WEIGHT: 5.0 34 | HIDDEN_DIM: 256 35 | NUM_OBJECT_QUERIES: 100 36 | NHEADS: 8 37 | DROPOUT: 0.0 38 | DIM_FEEDFORWARD: 2048 39 | ENC_LAYERS: 0 40 | PRE_NORM: False 41 | ENFORCE_INPUT_PROJ: False 42 | SIZE_DIVISIBILITY: 32 43 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 44 | TRAIN_NUM_POINTS: 12544 45 | OVERSAMPLE_RATIO: 3.0 46 | IMPORTANCE_SAMPLE_RATIO: 0.75 47 | PARTS_ON: True 48 | NUM_PART_CLASSES: 107 49 | LOSS_WEIGHT_PARTS: 1.0 50 | LOSS_WEIGHT_PANOPTIC: 1.0 51 | PARTS_CONF_THRESHOLD: 0.1 52 | TEST: 53 | SEMANTIC_ON: True 54 | INSTANCE_ON: False 55 | PANOPTIC_ON: True 56 | PARTS_ON: True 57 | OVERLAP_THRESHOLD: 0.8 58 | OBJECT_MASK_THRESHOLD: 0.5 59 | PPS_EVAL_SPEC: "utils/panoptic_parts/panoptic_parts/specs/eval_specs/ppq_ppp_59_107_cvpr21_default_evalspec.yaml" -------------------------------------------------------------------------------- /configs/pascal/pps/pascal_107/tapps_pascal107_r50_in1kinit.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../Base-Pascal-PPS-LSJ.yaml 2 | OUTPUT_DIR: "output/tapps_pascal107_r50_in1kinit/" 3 | DATASETS: 4 | TRAIN: ("pascal_panoptic_parts_107_train",) 5 | TEST: ("pascal_panoptic_parts_107_val",) 6 | NAME: "Pascal107" 7 | MODEL: 8 | META_ARCHITECTURE: "MaskFormer" 9 | SEM_SEG_HEAD: 10 | NAME: "MaskFormerHead" 11 | IGNORE_VALUE: 255 12 | NUM_CLASSES: 59 13 | LOSS_WEIGHT: 1.0 14 | CONVS_DIM: 256 15 | MASK_DIM: 256 16 | NORM: "GN" 17 | # pixel decoder 18 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 19 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 20 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 21 | COMMON_STRIDE: 4 22 | TRANSFORMER_ENC_LAYERS: 6 23 | MASK_FORMER: 24 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 25 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 26 | DEEP_SUPERVISION: True 27 | NO_OBJECT_WEIGHT: 0.1 28 | CLASS_WEIGHT: 2.0 29 | MASK_WEIGHT: 5.0 30 | DICE_WEIGHT: 5.0 31 | HIDDEN_DIM: 256 32 | NUM_OBJECT_QUERIES: 100 33 | NHEADS: 8 34 | DROPOUT: 0.0 35 | DIM_FEEDFORWARD: 2048 36 | ENC_LAYERS: 0 37 | PRE_NORM: False 
38 | ENFORCE_INPUT_PROJ: False 39 | SIZE_DIVISIBILITY: 32 40 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 41 | TRAIN_NUM_POINTS: 12544 42 | OVERSAMPLE_RATIO: 3.0 43 | IMPORTANCE_SAMPLE_RATIO: 0.75 44 | PARTS_ON: True 45 | NUM_PART_CLASSES: 107 46 | LOSS_WEIGHT_PARTS: 1.0 47 | LOSS_WEIGHT_PANOPTIC: 1.0 48 | PARTS_CONF_THRESHOLD: 0.1 49 | TEST: 50 | SEMANTIC_ON: True 51 | INSTANCE_ON: False 52 | PANOPTIC_ON: True 53 | PARTS_ON: True 54 | OVERLAP_THRESHOLD: 0.8 55 | OBJECT_MASK_THRESHOLD: 0.5 56 | PPS_EVAL_SPEC: "utils/panoptic_parts/panoptic_parts/specs/eval_specs/ppq_ppp_59_107_cvpr21_default_evalspec.yaml" -------------------------------------------------------------------------------- /configs/pascal/pps/tapps_pascal_r50_cocoinit.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Pascal-PPS-LSJ.yaml 2 | OUTPUT_DIR: "output/tapps_pascal_r50_cocoinit/" 3 | SOLVER: 4 | MAX_ITER: 10000 5 | MODEL: 6 | WEIGHTS: "checkpoints/maskformer2_R50_bs16_50ep/model_final_94dc52.pkl" 7 | META_ARCHITECTURE: "MaskFormer" 8 | SEM_SEG_HEAD: 9 | NAME: "MaskFormerHead" 10 | IGNORE_VALUE: 255 11 | NUM_CLASSES: 59 12 | LOSS_WEIGHT: 1.0 13 | CONVS_DIM: 256 14 | MASK_DIM: 256 15 | NORM: "GN" 16 | # pixel decoder 17 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 18 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 19 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 20 | COMMON_STRIDE: 4 21 | TRANSFORMER_ENC_LAYERS: 6 22 | MASK_FORMER: 23 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 24 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 25 | DEEP_SUPERVISION: True 26 | NO_OBJECT_WEIGHT: 0.1 27 | CLASS_WEIGHT: 2.0 28 | MASK_WEIGHT: 5.0 29 | DICE_WEIGHT: 5.0 30 | HIDDEN_DIM: 256 31 | NUM_OBJECT_QUERIES: 100 32 | NHEADS: 8 33 | DROPOUT: 0.0 34 | DIM_FEEDFORWARD: 2048 35 | ENC_LAYERS: 0 36 | PRE_NORM: False 37 | ENFORCE_INPUT_PROJ: False 38 | SIZE_DIVISIBILITY: 32 39 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 40 | TRAIN_NUM_POINTS: 12544 41 | OVERSAMPLE_RATIO: 3.0 42 | IMPORTANCE_SAMPLE_RATIO: 0.75 43 | PARTS_ON: True 44 | NUM_PART_CLASSES: 57 45 | LOSS_WEIGHT_PARTS: 1.0 46 | LOSS_WEIGHT_PANOPTIC: 1.0 47 | PARTS_CONF_THRESHOLD: 0.1 48 | TEST: 49 | SEMANTIC_ON: True 50 | INSTANCE_ON: False 51 | PANOPTIC_ON: True 52 | PARTS_ON: True 53 | OVERLAP_THRESHOLD: 0.8 54 | OBJECT_MASK_THRESHOLD: 0.5 -------------------------------------------------------------------------------- /configs/pascal/pps/tapps_pascal_r50_in1kinit.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Pascal-PPS-LSJ.yaml 2 | OUTPUT_DIR: "output/tapps_pascal_r50_in1kinit/" 3 | MODEL: 4 | META_ARCHITECTURE: "MaskFormer" 5 | SEM_SEG_HEAD: 6 | NAME: "MaskFormerHead" 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 59 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 
8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | PARTS_ON: True 41 | NUM_PART_CLASSES: 57 42 | LOSS_WEIGHT_PARTS: 1.0 43 | LOSS_WEIGHT_PANOPTIC: 1.0 44 | PARTS_CONF_THRESHOLD: 0.1 45 | TEST: 46 | SEMANTIC_ON: True 47 | INSTANCE_ON: False 48 | PANOPTIC_ON: True 49 | PARTS_ON: True 50 | OVERLAP_THRESHOLD: 0.8 51 | OBJECT_MASK_THRESHOLD: 0.5 -------------------------------------------------------------------------------- /configs/pascal/pps/tapps_pascal_swinb_cocoinit.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Pascal-PPS-LSJ.yaml 2 | OUTPUT_DIR: "output/tapps_pascal_swinb_cocoinit/" 3 | SOLVER: 4 | MAX_ITER: 10000 5 | MODEL: 6 | WEIGHTS: "checkpoints/maskformer2_swin_base_IN21k_384_bs16_50ep/model_final_54b88a.pkl" 7 | BACKBONE: 8 | NAME: "D2SwinTransformer" 9 | SWIN: 10 | EMBED_DIM: 128 11 | DEPTHS: [ 2, 2, 18, 2 ] 12 | NUM_HEADS: [ 4, 8, 16, 32 ] 13 | WINDOW_SIZE: 12 14 | APE: False 15 | DROP_PATH_RATE: 0.3 16 | PATCH_NORM: True 17 | PRETRAIN_IMG_SIZE: 384 18 | PIXEL_MEAN: [ 123.675, 116.280, 103.530 ] 19 | PIXEL_STD: [ 58.395, 57.120, 57.375 ] 20 | META_ARCHITECTURE: "MaskFormer" 21 | SEM_SEG_HEAD: 22 | NAME: "MaskFormerHead" 23 | IGNORE_VALUE: 255 24 | NUM_CLASSES: 59 25 | LOSS_WEIGHT: 1.0 26 | CONVS_DIM: 256 27 | MASK_DIM: 256 28 | NORM: "GN" 29 | # pixel decoder 30 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 31 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 32 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 33 | COMMON_STRIDE: 4 34 | TRANSFORMER_ENC_LAYERS: 6 35 | MASK_FORMER: 36 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 37 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 38 | DEEP_SUPERVISION: True 39 | NO_OBJECT_WEIGHT: 0.1 40 | CLASS_WEIGHT: 2.0 41 | MASK_WEIGHT: 5.0 42 | DICE_WEIGHT: 5.0 43 | HIDDEN_DIM: 256 44 | NUM_OBJECT_QUERIES: 100 45 | NHEADS: 8 46 | DROPOUT: 0.0 47 | DIM_FEEDFORWARD: 2048 48 | ENC_LAYERS: 0 49 | PRE_NORM: False 50 | ENFORCE_INPUT_PROJ: False 51 | SIZE_DIVISIBILITY: 32 52 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 53 | TRAIN_NUM_POINTS: 12544 54 | OVERSAMPLE_RATIO: 3.0 55 | IMPORTANCE_SAMPLE_RATIO: 0.75 56 | PARTS_ON: True 57 | NUM_PART_CLASSES: 57 58 | LOSS_WEIGHT_PARTS: 1.0 59 | LOSS_WEIGHT_PANOPTIC: 1.0 60 | PARTS_CONF_THRESHOLD: 0.1 61 | TEST: 62 | SEMANTIC_ON: True 63 | INSTANCE_ON: False 64 | PANOPTIC_ON: True 65 | PARTS_ON: True 66 | OVERLAP_THRESHOLD: 0.8 67 | OBJECT_MASK_THRESHOLD: 0.5 -------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | # Prepare Datasets for TAPPS 2 | 3 | TAPPS has builtin support for two datasets: Cityscapes Panoptic Parts (Cityscapes-PP) and Pascal Panoptic Parts (Pascal-PP) 4 | The datasets are assumed to exist in a directory specified by the environment variable 5 | `DETECTRON2_DATASETS`. 6 | Under this directory, detectron2 will look for datasets in the structure described below. 7 | ``` 8 | $DETECTRON2_DATASETS/ 9 | cityscapes/ 10 | pascal/ 11 | ``` 12 | 13 | You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`. 
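As a minimal sketch of how this root is typically resolved (an assumption based on the same `os.getenv` convention used by `eval/eval_partpq.py` later in this repository; the actual dataset registration lives in `tapps/data/datasets/`, and the variable names below are only illustrative):

```python
import os

# detectron2-style dataset root: honour $DETECTRON2_DATASETS, fall back to ./datasets
root = os.getenv("DETECTRON2_DATASETS", "datasets")

# TAPPS then expects the two builtin datasets directly under this root
cityscapes_pp_root = os.path.join(root, "cityscapes")
pascal_pp_root = os.path.join(root, "pascal")
print(cityscapes_pp_root, pascal_pp_root)
```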
14 | If left unset, the default is `./datasets` relative to your current working directory. 15 | 16 | ## Expected dataset structure for [Cityscapes-PP](https://github.com/pmeletis/panoptic_parts): 17 | 18 | First download the [Cityscapes dataset](https://cityscapes-dataset.com/downloads/) and put the data in the `cityscapes` directory. Download `gtFine_trainvaltest.zip`, `leftImg8bit_trainvaltest.zip`, and `gtFinePanopticParts.zip`. Structure it as below: 19 | 20 | ``` 21 | cityscapes/ 22 | gtFine/ 23 | train/ 24 | aachen/ 25 | ... 26 | val/ 27 | test/ 28 | leftImg8bit/ 29 | train/ 30 | val/ 31 | test/ 32 | gtFinePanopticParts/ 33 | train/ 34 | val/ 35 | ``` 36 | In any directory, clone cityscapesScripts by: 37 | ```bash 38 | git clone https://github.com/mcordts/cityscapesScripts.git 39 | ``` 40 | 41 | To create labelTrainIds.png, first prepare the above structure, then run cityscapesScripts with: 42 | ```bash 43 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesscripts/preparation/createTrainIdLabelImgs.py 44 | ``` 45 | 46 | To generate Cityscapes panoptic dataset, run cityscapesScripts with: 47 | ```bash 48 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesscripts/preparation/createPanopticImgs.py 49 | ``` 50 | 51 | To prepare the part segmentation files necessary for training, run: 52 | ```bash 53 | python datasets/prepare_cityscapes_pp.py 54 | ``` 55 | 56 | After doing this, the data should be in the following structure: 57 | ``` 58 | cityscapes/ 59 | gtFine/ 60 | train/ 61 | aachen/ 62 | ... 63 | val/ 64 | test/ 65 | cityscapes_panoptic_train.json 66 | cityscapes_panoptic_train/ 67 | cityscapes_panoptic_val.json 68 | cityscapes_panoptic_val/ 69 | cityscapes_panoptic_test.json 70 | cityscapes_panoptic_test/ 71 | leftImg8bit/ 72 | train/ 73 | val/ 74 | test/ 75 | gtFinePanopticParts/ 76 | train/ 77 | val/ 78 | gtFineParts/ 79 | train/ 80 | val/ 81 | images_val.json 82 | images_train.json 83 | ``` 84 | 85 | ## Expected dataset structure for [Pascal-PP](https://github.com/pmeletis/panoptic_parts): 86 | 87 | Download the [Pascal-PP labels](https://github.com/pmeletis/panoptic_parts) and the [Pascal VOC 2010 images](http://host.robots.ox.ac.uk/pascal/VOC/voc2010/). 
Organize the data in the following structure: 88 | 89 | ``` 90 | pascal/ 91 | JPEGImages/ # From VOC2010 92 | labels/ # From pascal_panoptic_parts_v2.0 93 | training/ 94 | validation/ 95 | ``` 96 | 97 | To generate the panoptic, semantic and part segmentation annotations and split the images into training and validation splits, run: 98 | ```bash 99 | python datasets/prepare_pascal_pp.py 100 | ``` 101 | 102 | Afterwards, the data should have the following structure: 103 | ``` 104 | pascal/ 105 | images/ 106 | training/ 107 | validation/ 108 | labels/ 109 | training/ 110 | validation/ 111 | panoptic/ 112 | training/ 113 | validation/ 114 | panoptic_training.json 115 | panoptic_validation.json 116 | semantic/ 117 | training/ 118 | validation/ 119 | parts/ 120 | training/ 121 | validation/ 122 | images_training.json 123 | images_validation.json 124 | ``` 125 | 126 | Note: if you wish to use the Pascal-PP-107 labels (instead of the default Pascal-PP-57), also run: 127 | ```bash 128 | python datasets/prepare_pascal_pp_107.py 129 | ``` -------------------------------------------------------------------------------- /datasets/ade20k_instance_catid_mapping.txt: -------------------------------------------------------------------------------- 1 | Instacne100 SceneParse150 FullADE20K 2 | 1 8 165 3 | 2 9 3055 4 | 3 11 350 5 | 4 13 1831 6 | 5 15 774 7 | 5 15 783 8 | 6 16 2684 9 | 7 19 687 10 | 8 20 471 11 | 9 21 401 12 | 10 23 1735 13 | 11 24 2473 14 | 12 25 2329 15 | 13 28 1564 16 | 14 31 57 17 | 15 32 2272 18 | 16 33 907 19 | 17 34 724 20 | 18 36 2985 21 | 18 36 533 22 | 19 37 1395 23 | 20 38 155 24 | 21 39 2053 25 | 22 40 689 26 | 23 42 266 27 | 24 43 581 28 | 25 44 2380 29 | 26 45 491 30 | 27 46 627 31 | 28 48 2388 32 | 29 50 943 33 | 30 51 2096 34 | 31 54 2530 35 | 32 56 420 36 | 33 57 1948 37 | 34 58 1869 38 | 35 59 2251 39 | 36 63 239 40 | 37 65 571 41 | 38 66 2793 42 | 39 67 978 43 | 40 68 236 44 | 41 70 181 45 | 42 71 629 46 | 43 72 2598 47 | 44 73 1744 48 | 45 74 1374 49 | 46 75 591 50 | 47 76 2679 51 | 48 77 223 52 | 49 79 47 53 | 50 81 327 54 | 51 82 2821 55 | 52 83 1451 56 | 53 84 2880 57 | 54 86 480 58 | 55 87 77 59 | 56 88 2616 60 | 57 89 246 61 | 57 89 247 62 | 58 90 2733 63 | 59 91 14 64 | 60 93 38 65 | 61 94 1936 66 | 62 96 120 67 | 63 98 1702 68 | 64 99 249 69 | 65 103 2928 70 | 66 104 2337 71 | 67 105 1023 72 | 68 108 2989 73 | 69 109 1930 74 | 70 111 2586 75 | 71 112 131 76 | 72 113 146 77 | 73 116 95 78 | 74 117 1563 79 | 75 119 1708 80 | 76 120 103 81 | 77 121 1002 82 | 78 122 2569 83 | 79 124 2833 84 | 80 125 1551 85 | 81 126 1981 86 | 82 127 29 87 | 83 128 187 88 | 84 130 747 89 | 85 131 2254 90 | 86 133 2262 91 | 87 134 1260 92 | 88 135 2243 93 | 89 136 2932 94 | 90 137 2836 95 | 91 138 2850 96 | 92 139 64 97 | 93 140 894 98 | 94 143 1919 99 | 95 144 1583 100 | 96 145 318 101 | 97 147 2046 102 | 98 148 1098 103 | 99 149 530 104 | 100 150 954 105 | -------------------------------------------------------------------------------- /datasets/prepare_ade20k_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | import os 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import tqdm 9 | from PIL import Image 10 | 11 | 12 | def convert(input, output): 13 | img = np.asarray(Image.open(input)) 14 | assert img.dtype == np.uint8 15 | img = img - 1 # 0 (ignore) becomes 255. 
others are shifted by 1 16 | Image.fromarray(img).save(output) 17 | 18 | 19 | if __name__ == "__main__": 20 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016" 21 | for name in ["training", "validation"]: 22 | annotation_dir = dataset_dir / "annotations" / name 23 | output_dir = dataset_dir / "annotations_detectron2" / name 24 | output_dir.mkdir(parents=True, exist_ok=True) 25 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 26 | output_file = output_dir / file.name 27 | convert(file, output_file) 28 | -------------------------------------------------------------------------------- /datasets/prepare_coco_semantic_annos_from_panoptic_annos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | 5 | import functools 6 | import json 7 | import multiprocessing as mp 8 | import numpy as np 9 | import os 10 | import time 11 | from fvcore.common.download import download 12 | from panopticapi.utils import rgb2id 13 | from PIL import Image 14 | 15 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES 16 | 17 | 18 | def _process_panoptic_to_semantic(input_panoptic, output_semantic, segments, id_map): 19 | panoptic = np.asarray(Image.open(input_panoptic), dtype=np.uint32) 20 | panoptic = rgb2id(panoptic) 21 | output = np.zeros_like(panoptic, dtype=np.uint8) + 255 22 | for seg in segments: 23 | cat_id = seg["category_id"] 24 | new_cat_id = id_map[cat_id] 25 | output[panoptic == seg["id"]] = new_cat_id 26 | Image.fromarray(output).save(output_semantic) 27 | 28 | 29 | def separate_coco_semantic_from_panoptic(panoptic_json, panoptic_root, sem_seg_root, categories): 30 | """ 31 | Create semantic segmentation annotations from panoptic segmentation 32 | annotations, to be used by PanopticFPN. 33 | It maps all thing categories to class 0, and maps all unlabeled pixels to class 255. 34 | It maps all stuff categories to contiguous ids starting from 1. 35 | Args: 36 | panoptic_json (str): path to the panoptic json file, in COCO's format. 37 | panoptic_root (str): a directory with panoptic annotation files, in COCO's format. 38 | sem_seg_root (str): a directory to output semantic annotation files 39 | categories (list[dict]): category metadata. Each dict needs to have: 40 | "id": corresponds to the "category_id" in the json annotations 41 | "isthing": 0 or 1 42 | """ 43 | os.makedirs(sem_seg_root, exist_ok=True) 44 | 45 | id_map = {} # map from category id to id in the output semantic annotation 46 | assert len(categories) <= 254 47 | for i, k in enumerate(categories): 48 | id_map[k["id"]] = i 49 | # what is id = 0? 50 | # id_map[0] = 255 51 | print(id_map) 52 | 53 | with open(panoptic_json) as f: 54 | obj = json.load(f) 55 | 56 | pool = mp.Pool(processes=max(mp.cpu_count() // 2, 4)) 57 | 58 | def iter_annotations(): 59 | for anno in obj["annotations"]: 60 | file_name = anno["file_name"] 61 | segments = anno["segments_info"] 62 | input = os.path.join(panoptic_root, file_name) 63 | output = os.path.join(sem_seg_root, file_name) 64 | yield input, output, segments 65 | 66 | print("Start writing to {} ...".format(sem_seg_root)) 67 | start = time.time() 68 | pool.starmap( 69 | functools.partial(_process_panoptic_to_semantic, id_map=id_map), 70 | iter_annotations(), 71 | chunksize=100, 72 | ) 73 | print("Finished. 
time: {:.2f}s".format(time.time() - start)) 74 | 75 | 76 | if __name__ == "__main__": 77 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco") 78 | for s in ["val2017", "train2017"]: 79 | separate_coco_semantic_from_panoptic( 80 | os.path.join(dataset_dir, "annotations/panoptic_{}.json".format(s)), 81 | os.path.join(dataset_dir, "panoptic_{}".format(s)), 82 | os.path.join(dataset_dir, "panoptic_semseg_{}".format(s)), 83 | COCO_CATEGORIES, 84 | ) 85 | -------------------------------------------------------------------------------- /eval/eval_partpq.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import argparse 4 | 5 | sys.path.append("utils/panoptic_parts") 6 | from panoptic_parts.evaluation import eval_PartPQ 7 | 8 | def eval_partpq(save_dir, dataset): 9 | root = os.getenv("DETECTRON2_DATASETS", "datasets") 10 | 11 | if dataset in ['pascal', 'Pascal', 'pascal57', 'Pascal57']: 12 | eval_spec_path = "utils/panoptic_parts/panoptic_parts/specs/eval_specs/ppq_ppp_59_57_cvpr21_default_evalspec.yaml" 13 | gt_path = os.path.join(root, "pascal/labels/validation") 14 | images_json = os.path.join(root, "pascal/images_validation.json") 15 | elif dataset in ['pascal107, Pascal107']: 16 | eval_spec_path = "utils/panoptic_parts/panoptic_parts/specs/eval_specs/ppq_ppp_59_107_cvpr21_default_evalspec.yaml" 17 | gt_path = os.path.join(root, "pascal/labels/validation") 18 | images_json = os.path.join(root, "pascal/images_validation.json") 19 | elif dataset in ['cityscapes', 'Cityscapes']: 20 | eval_spec_path = "utils/panoptic_parts/panoptic_parts/specs/eval_specs/ppq_cpp_19_23_cvpr21_grouped_evalspec.yaml" 21 | gt_path = os.path.join(root, "cityscapes/gtFinePanopticParts/val/") 22 | images_json = os.path.join(root, "cityscapes/images_val.json") 23 | else: 24 | raise NotImplementedError(f"Only implemented for Pascal, Pascal107 and Cityscapes, not {dataset}.") 25 | 26 | pps_pred_path = os.path.join(save_dir, "pps") 27 | results_dir = os.path.join(save_dir, "results") 28 | 29 | # Eval PPS predictions with PartPQ 30 | results = eval_PartPQ.evaluate(eval_spec_path, 31 | gt_path, 32 | pps_pred_path, 33 | images_json, 34 | save_dir=results_dir, 35 | return_results=True) 36 | 37 | part_pq = results[0][0]["PartPQ"] 38 | part_pq_p = results[0][1]["PartPQ_parts"] 39 | part_pq_np = results[0][2]["PartPQ_noparts"] 40 | metrics = {"part_pq": part_pq, "part_pq_p": part_pq_p, "part_pq_np": part_pq_np} 41 | print(metrics) 42 | 43 | 44 | if __name__ == "__main__": 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument('--save_dir', type=str) 47 | parser.add_argument('--dataset', type=str) 48 | args = parser.parse_args() 49 | 50 | save_dir = args.save_dir 51 | dataset = args.dataset 52 | 53 | eval_partpq(save_dir, 54 | dataset) 55 | -------------------------------------------------------------------------------- /eval/visualize_pps.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | import sys 5 | import psutil 6 | 7 | from PIL import Image 8 | from functools import partial 9 | 10 | import multiprocessing 11 | from multiprocessing import Pool 12 | 13 | sys.path.append("utils/panoptic_parts") 14 | from panoptic_parts.utils.format import encode_ids 15 | from panoptic_parts.utils.visualization import experimental_colorize_label 16 | from panoptic_parts.specs.eval_spec import PartPQEvalSpec 17 | 18 | 19 | def 
colorize_pps_and_store_single(file, predictions_path, sid2color, save_path, eval_spec, dataset, cpu_aff=None): 20 | predictions_np = np.array(Image.open(os.path.join(predictions_path, file)), dtype=np.int32) 21 | sids = predictions_np[..., 0] 22 | iids = predictions_np[..., 1] 23 | pids = predictions_np[..., 2] 24 | 25 | sids_no_parts = eval_spec.eval_sid_no_parts 26 | sids_stuff = eval_spec.eval_sid_stuff 27 | 28 | sids_wo_parts = np.isin(sids, sids_no_parts) 29 | sids_w_stuff = np.isin(sids, sids_stuff) 30 | iids[sids_w_stuff] = -1 31 | pids[sids_wo_parts] = -1 32 | 33 | sids[sids == 255] = 0 34 | pids[pids == 255] = -1 35 | iids[iids == 255] = -1 36 | uids = encode_ids(sids, iids, pids) 37 | 38 | if cpu_aff is not None: 39 | process = psutil.Process() 40 | process.cpu_affinity(cpu_aff) 41 | 42 | if dataset == 'cityscapes' or dataset == 'Cityscapes': 43 | is_cpp = True 44 | else: 45 | is_cpp = False 46 | 47 | pps_colors = experimental_colorize_label(uids, sid2color=sid2color, is_cpp=is_cpp) 48 | pps_colors_img = Image.fromarray(pps_colors.astype(np.uint8)) 49 | pps_colors_img.save(os.path.join(save_path, file)) 50 | 51 | 52 | def convert_pps_to_colors_and_store(predictions_path, save_path, eval_spec_path, dataset): 53 | eval_spec = PartPQEvalSpec(eval_spec_path) 54 | sid2color = eval_spec.dataset_spec.sid2scene_color 55 | 56 | files = list() 57 | for file in os.listdir(predictions_path): 58 | if file.endswith(".png"): 59 | files.append(file) 60 | 61 | if not os.path.exists(save_path): 62 | os.mkdir(save_path) 63 | 64 | # for file in tqdm(files): 65 | process = psutil.Process() 66 | cpu_aff = process.cpu_affinity() 67 | 68 | num_cpus = round(multiprocessing.cpu_count() / 2) 69 | 70 | colorize_pps_and_store_single_fn = partial(colorize_pps_and_store_single, 71 | predictions_path=predictions_path, 72 | sid2color=sid2color, 73 | save_path=save_path, 74 | cpu_aff=cpu_aff, 75 | eval_spec=eval_spec, 76 | dataset=dataset) 77 | print(f"Now visualizing {len(files)} PPS predictions... 
this could take a while.") 78 | with Pool(num_cpus) as p: 79 | p.map(colorize_pps_and_store_single_fn, files) 80 | 81 | 82 | def visualize(pred_dir, save_dir, dataset): 83 | if dataset in ['pascal', 'Pascal', 'pascal57', 'Pascal57']: 84 | eval_spec_path = "utils/panoptic_parts/panoptic_parts/specs/eval_specs/ppq_ppp_59_57_cvpr21_default_evalspec.yaml" 85 | elif dataset in ['pascal107, Pascal107']: 86 | eval_spec_path = "utils/panoptic_parts/panoptic_parts/specs/eval_specs/ppq_ppp_59_107_cvpr21_default_evalspec.yaml" 87 | elif dataset in ['cityscapes', 'Cityscapes']: 88 | eval_spec_path = "utils/panoptic_parts/panoptic_parts/specs/eval_specs/ppq_cpp_19_23_cvpr21_grouped_evalspec.yaml" 89 | else: 90 | raise NotImplementedError(f"Only implemented for Pascal, Pascal107 and Cityscapes, not {dataset}.") 91 | 92 | if not os.path.isdir(save_dir): 93 | os.mkdir(save_dir) 94 | 95 | convert_pps_to_colors_and_store(pred_dir, save_dir, eval_spec_path, dataset=dataset) 96 | 97 | if __name__ == "__main__": 98 | parser = argparse.ArgumentParser() 99 | parser.add_argument('--pred_dir', type=str) 100 | parser.add_argument('--save_dir', type=str) 101 | parser.add_argument('--dataset', type=str) 102 | args = parser.parse_args() 103 | 104 | pred_dir = args.pred_dir 105 | save_dir = args.save_dir 106 | dataset = args.dataset 107 | 108 | visualize(pred_dir, 109 | save_dir, 110 | dataset) 111 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | scipy 3 | shapely 4 | timm 5 | h5py 6 | submitit 7 | scikit-image 8 | psutil 9 | scikit-learn -------------------------------------------------------------------------------- /tapps/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import data # register all new datasets 3 | from . 
import modeling 4 | 5 | # config 6 | from .config import add_maskformer2_config 7 | 8 | # dataset loading 9 | from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper 10 | from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper 11 | from .data.dataset_mappers.mask_former_instance_dataset_mapper import ( 12 | MaskFormerInstanceDatasetMapper, 13 | ) 14 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import ( 15 | MaskFormerPanopticDatasetMapper, 16 | ) 17 | 18 | from .data.dataset_mappers.mask_former_panoptic_parts_dataset_mapper import ( 19 | MaskFormerPanopticPartsDatasetMapper, 20 | ) 21 | 22 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 23 | MaskFormerSemanticDatasetMapper, 24 | ) 25 | 26 | from .data.dataset_mappers.pascal_panoptic_parts_new_baseline_dataset_mapper import ( 27 | PascalPanopticPartsNewBaselineDatasetMapper, 28 | ) 29 | 30 | from .data.datasets.register_pascal_panoptic_parts import register_all_pascal_panoptic_parts 31 | from .data.datasets.register_cityscapes_panoptic_parts import register_all_cityscapes_panoptic_parts 32 | from .data.datasets.register_pascal_panoptic_parts_107 import register_all_pascal_panoptic_parts_107 33 | 34 | # models 35 | from .maskformer_model import MaskFormer 36 | from .test_time_augmentation import SemanticSegmentorWithTTA 37 | 38 | # evaluation 39 | from .evaluation.instance_evaluation import InstanceSegEvaluator 40 | -------------------------------------------------------------------------------- /tapps/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import datasets 3 | -------------------------------------------------------------------------------- /tapps/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /tapps/data/dataset_mappers/augmentations.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | """ 4 | Implement many useful :class:`Augmentation`. 5 | """ 6 | 7 | import numpy as np 8 | from typing import List, Optional, Union 9 | 10 | 11 | from fvcore.transforms.transform import ( 12 | Transform, 13 | TransformList, 14 | ) 15 | 16 | from detectron2.data.transforms.augmentation import Augmentation, AugmentationList 17 | 18 | __all__ = [ 19 | "AugInput", 20 | ] 21 | 22 | 23 | def _check_img_dtype(img): 24 | assert isinstance(img, np.ndarray), "[Augmentation] Needs an numpy array, but got a {}!".format( 25 | type(img) 26 | ) 27 | assert not isinstance(img.dtype, np.integer) or ( 28 | img.dtype == np.uint8 29 | ), "[Augmentation] Got image of type {}, use uint8 or floating points instead!".format( 30 | img.dtype 31 | ) 32 | assert img.ndim in [2, 3], img.ndim 33 | 34 | 35 | 36 | class AugInput: 37 | """ 38 | Input that can be used with :meth:`Augmentation.__call__`. 39 | This is a standard implementation for the majority of use cases. 40 | This class provides the standard attributes **"image", "boxes", "sem_seg"** 41 | defined in :meth:`__init__` and they may be needed by different augmentations. 
42 | Most augmentation policies do not need attributes beyond these three. 43 | 44 | After applying augmentations to these attributes (using :meth:`AugInput.transform`), 45 | the returned transforms can then be used to transform other data structures that users have. 46 | 47 | Examples: 48 | :: 49 | input = AugInput(image, boxes=boxes) 50 | tfms = augmentation(input) 51 | transformed_image = input.image 52 | transformed_boxes = input.boxes 53 | transformed_other_data = tfms.apply_other(other_data) 54 | 55 | An extended project that works with new data types may implement augmentation policies 56 | that need other inputs. An algorithm may need to transform inputs in a way different 57 | from the standard approach defined in this class. In those rare situations, users can 58 | implement a class similar to this class, that satify the following condition: 59 | 60 | * The input must provide access to these data in the form of attribute access 61 | (``getattr``). For example, if an :class:`Augmentation` to be applied needs "image" 62 | and "sem_seg" arguments, its input must have the attribute "image" and "sem_seg". 63 | * The input must have a ``transform(tfm: Transform) -> None`` method which 64 | in-place transforms all its attributes. 65 | """ 66 | 67 | def __init__( 68 | self, 69 | image: np.ndarray, 70 | *, 71 | boxes: Optional[np.ndarray] = None, 72 | sem_seg: Optional[np.ndarray] = None, 73 | part_seg: Optional[np.ndarray] = None, 74 | ratio = None 75 | ): 76 | """ 77 | Args: 78 | image (ndarray): (H,W) or (H,W,C) ndarray of type uint8 in range [0, 255], or 79 | floating point in range [0, 1] or [0, 255]. The meaning of C is up 80 | to users. 81 | boxes (ndarray or None): Nx4 float32 boxes in XYXY_ABS mode 82 | sem_seg (ndarray or None): HxW uint8 semantic segmentation mask. Each element 83 | is an integer label of pixel. 84 | part_seg (ndarray or None): HxW uint8 part segmentation mask. Each element 85 | is an integer label of pixel. 86 | """ 87 | _check_img_dtype(image) 88 | self.image = image 89 | self.boxes = boxes 90 | self.sem_seg = sem_seg 91 | self.part_seg = part_seg 92 | self.ratio = ratio 93 | 94 | def transform(self, tfm: Transform) -> None: 95 | """ 96 | In-place transform all attributes of this class. 97 | 98 | By "in-place", it means after calling this method, accessing an attribute such 99 | as ``self.image`` will return transformed data. 100 | """ 101 | self.image = tfm.apply_image(self.image) 102 | if self.boxes is not None: 103 | self.boxes = tfm.apply_box(self.boxes) 104 | if self.sem_seg is not None: 105 | self.sem_seg = tfm.apply_segmentation(self.sem_seg) 106 | if self.part_seg is not None: 107 | self.part_seg = tfm.apply_segmentation(self.part_seg) 108 | 109 | def apply_augmentations( 110 | self, augmentations: List[Union[Augmentation, Transform]] 111 | ) -> TransformList: 112 | """ 113 | Equivalent of ``AugmentationList(augmentations)(self)`` 114 | """ 115 | return AugmentationList(augmentations)(self) 116 | -------------------------------------------------------------------------------- /tapps/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . 
import ( 3 | register_ade20k_full, 4 | register_ade20k_panoptic, 5 | register_coco_stuff_10k, 6 | register_mapillary_vistas, 7 | register_coco_panoptic_annos_semseg, 8 | register_ade20k_instance, 9 | register_mapillary_vistas_panoptic, 10 | register_cityscapes_panoptic_parts, 11 | register_pascal_panoptic_parts, 12 | register_pascal_panoptic_parts_107, 13 | ) 14 | -------------------------------------------------------------------------------- /tapps/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tue-mps/tapps/54d8d9f039fa91da6e046bfd8c62c13a9dc5e8b0/tapps/evaluation/__init__.py -------------------------------------------------------------------------------- /tapps/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .backbone.swin import D2SwinTransformer 3 | from .pixel_decoder.fpn import BasePixelDecoder 4 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder 5 | from .meta_arch.mask_former_head import MaskFormerHead 6 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead 7 | -------------------------------------------------------------------------------- /tapps/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /tapps/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /tapps/modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /tapps/modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /tapps/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | try: 22 | import MultiScaleDeformableAttention as MSDA 23 | except ModuleNotFoundError as e: 24 | info_string = ( 25 | "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" 26 | "\t`cd tapps/modeling/pixel_decoder/ops`\n" 27 | "\t`sh make.sh`\n" 28 | ) 29 | raise ModuleNotFoundError(info_string) 30 | 31 | 32 | class MSDeformAttnFunction(Function): 33 | @staticmethod 34 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 35 | ctx.im2col_step = im2col_step 36 | output = MSDA.ms_deform_attn_forward( 37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 38 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 39 | return output 40 | 41 | @staticmethod 42 | @once_differentiable 43 | def backward(ctx, grad_output): 44 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 45 | grad_value, grad_sampling_loc, grad_attn_weight = \ 46 | MSDA.ms_deform_attn_backward( 47 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 48 | 49 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 50 | 51 | 52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 53 | # for debug and test only, 54 | # need to use cuda version instead 55 | N_, S_, M_, D_ = value.shape 56 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 57 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 58 | sampling_grids = 2 * sampling_locations - 1 59 | sampling_value_list = [] 60 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 61 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 62 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 63 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 64 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 65 | # N_*M_, D_, Lq_, P_ 66 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 67 | mode='bilinear', padding_mode='zeros', align_corners=False) 68 | sampling_value_list.append(sampling_value_l_) 69 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 70 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 71 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 72 | return 
output.transpose(1, 2).contiguous() 73 | -------------------------------------------------------------------------------- /tapps/modeling/pixel_decoder/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | # Copyright (c) Facebook, Inc. and its affiliates. 11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 12 | 13 | python setup.py build install 14 | -------------------------------------------------------------------------------- /tapps/modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /tapps/modeling/pixel_decoder/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | import os 13 | import glob 14 | 15 | import torch 16 | 17 | from torch.utils.cpp_extension import CUDA_HOME 18 | from torch.utils.cpp_extension import CppExtension 19 | from torch.utils.cpp_extension import CUDAExtension 20 | 21 | from setuptools import find_packages 22 | from setuptools import setup 23 | 24 | requirements = ["torch", "torchvision"] 25 | 26 | def get_extensions(): 27 | this_dir = os.path.dirname(os.path.abspath(__file__)) 28 | extensions_dir = os.path.join(this_dir, "src") 29 | 30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 33 | 34 | sources = main_file + source_cpu 35 | extension = CppExtension 36 | extra_compile_args = {"cxx": []} 37 | define_macros = [] 38 | 39 | # Force cuda since torch ask for a device, not if cuda is in fact available. 40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: 41 | extension = CUDAExtension 42 | sources += source_cuda 43 | define_macros += [("WITH_CUDA", None)] 44 | extra_compile_args["nvcc"] = [ 45 | "-DCUDA_HAS_FP16=1", 46 | "-D__CUDA_NO_HALF_OPERATORS__", 47 | "-D__CUDA_NO_HALF_CONVERSIONS__", 48 | "-D__CUDA_NO_HALF2_OPERATORS__", 49 | ] 50 | else: 51 | if CUDA_HOME is None: 52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') 53 | else: 54 | raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') 55 | 56 | sources = [os.path.join(extensions_dir, s) for s in sources] 57 | include_dirs = [extensions_dir] 58 | ext_modules = [ 59 | extension( 60 | "MultiScaleDeformableAttention", 61 | sources, 62 | include_dirs=include_dirs, 63 | define_macros=define_macros, 64 | extra_compile_args=extra_compile_args, 65 | ) 66 | ] 67 | return ext_modules 68 | 69 | setup( 70 | name="MultiScaleDeformableAttention", 71 | version="1.0", 72 | author="Weijie Su", 73 | url="https://github.com/fundamentalvision/Deformable-DETR", 74 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 75 | packages=find_packages(exclude=("configs", "tests",)), 76 | ext_modules=get_extensions(), 77 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 78 | ) 79 | -------------------------------------------------------------------------------- /tapps/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | 22 | at::Tensor 23 | ms_deform_attn_cpu_forward( 24 | const at::Tensor &value, 25 | const at::Tensor &spatial_shapes, 26 | const at::Tensor &level_start_index, 27 | const at::Tensor &sampling_loc, 28 | const at::Tensor &attn_weight, 29 | const int im2col_step) 30 | { 31 | AT_ERROR("Not implement on cpu"); 32 | } 33 | 34 | std::vector 35 | ms_deform_attn_cpu_backward( 36 | const at::Tensor &value, 37 | const at::Tensor &spatial_shapes, 38 | const at::Tensor &level_start_index, 39 | const at::Tensor &sampling_loc, 40 | const at::Tensor &attn_weight, 41 | const at::Tensor &grad_output, 42 | const int im2col_step) 43 | { 44 | AT_ERROR("Not implement on cpu"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /tapps/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor 20 | ms_deform_attn_cpu_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step); 27 | 28 | std::vector 29 | ms_deform_attn_cpu_backward( 30 | const at::Tensor &value, 31 | const at::Tensor &spatial_shapes, 32 | const at::Tensor &level_start_index, 33 | const at::Tensor &sampling_loc, 34 | const at::Tensor &attn_weight, 35 | const at::Tensor &grad_output, 36 | const int im2col_step); 37 | 38 | 39 | -------------------------------------------------------------------------------- /tapps/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor ms_deform_attn_cuda_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step); 26 | 27 | std::vector ms_deform_attn_cuda_backward( 28 | const at::Tensor &value, 29 | const at::Tensor &spatial_shapes, 30 | const at::Tensor &level_start_index, 31 | const at::Tensor &sampling_loc, 32 | const at::Tensor &attn_weight, 33 | const at::Tensor &grad_output, 34 | const int im2col_step); 35 | 36 | -------------------------------------------------------------------------------- /tapps/modeling/pixel_decoder/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | 18 | #include "cpu/ms_deform_attn_cpu.h" 19 | 20 | #ifdef WITH_CUDA 21 | #include "cuda/ms_deform_attn_cuda.h" 22 | #endif 23 | 24 | 25 | at::Tensor 26 | ms_deform_attn_forward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const int im2col_step) 33 | { 34 | if (value.type().is_cuda()) 35 | { 36 | #ifdef WITH_CUDA 37 | return ms_deform_attn_cuda_forward( 38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector 47 | ms_deform_attn_backward( 48 | const at::Tensor &value, 49 | const at::Tensor &spatial_shapes, 50 | const at::Tensor &level_start_index, 51 | const at::Tensor &sampling_loc, 52 | const at::Tensor &attn_weight, 53 | const at::Tensor &grad_output, 54 | const int im2col_step) 55 | { 56 | if (value.type().is_cuda()) 57 | { 58 | #ifdef WITH_CUDA 59 | return ms_deform_attn_cuda_backward( 60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 61 | #else 62 | AT_ERROR("Not compiled with GPU support"); 63 | #endif 64 | } 65 | AT_ERROR("Not implemented on the CPU"); 66 | } 67 | 68 | -------------------------------------------------------------------------------- /tapps/modeling/pixel_decoder/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include "ms_deform_attn.h" 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /tapps/modeling/pixel_decoder/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import time 17 | import torch 18 | import torch.nn as nn 19 | from torch.autograd import gradcheck 20 | 21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | N, M, D = 1, 2, 2 25 | Lq, L, P = 2, 2, 2 26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 28 | S = sum([(H*W).item() for H, W in shapes]) 29 | 30 | 31 | torch.manual_seed(3) 32 | 33 | 34 | @torch.no_grad() 35 | def check_forward_equal_with_pytorch_double(): 36 | value = torch.rand(N, S, M, D).cuda() * 0.01 37 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 38 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 39 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 40 | im2col_step = 2 41 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 42 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 43 | fwdok = torch.allclose(output_cuda, output_pytorch) 44 | max_abs_err = (output_cuda - output_pytorch).abs().max() 45 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 46 | 47 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 48 | 49 | 50 | @torch.no_grad() 51 | def check_forward_equal_with_pytorch_float(): 52 | value = torch.rand(N, 
S, M, D).cuda() * 0.01 53 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 54 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 55 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 56 | im2col_step = 2 57 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 58 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 59 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 60 | max_abs_err = (output_cuda - output_pytorch).abs().max() 61 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 62 | 63 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 64 | 65 | 66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 67 | 68 | value = torch.rand(N, S, M, channels).cuda() * 0.01 69 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 70 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 71 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 72 | im2col_step = 2 73 | func = MSDeformAttnFunction.apply 74 | 75 | value.requires_grad = grad_value 76 | sampling_locations.requires_grad = grad_sampling_loc 77 | attention_weights.requires_grad = grad_attn_weight 78 | 79 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 80 | 81 | print(f'* {gradok} check_gradient_numerical(D={channels})') 82 | 83 | 84 | if __name__ == '__main__': 85 | check_forward_equal_with_pytorch_double() 86 | check_forward_equal_with_pytorch_float() 87 | 88 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 89 | check_gradient_numerical(channels, True, True, True) 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /tapps/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
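# Note: this package gathers the transformer decoder heads used in this repository: the original MaskFormer decoder (StandardTransformerDecoder), the Mask2Former multi-scale masked decoder (MultiScaleMaskedTransformerDecoder), and the PartDecoder that predicts part-level masks per query (see part_decoder.py below).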
2 | from .maskformer_transformer_decoder import StandardTransformerDecoder 3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder 4 | from .part_decoder import PartDecoder 5 | -------------------------------------------------------------------------------- /tapps/modeling/transformer_decoder/part_decoder.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import fvcore.nn.weight_init as weight_init 3 | from typing import Optional 4 | import torch 5 | from torch import nn, Tensor 6 | from torch.nn import functional as F 7 | import numpy as np 8 | 9 | 10 | class MLP(nn.Module): 11 | """ Very simple multi-layer perceptron (also called FFN)""" 12 | 13 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 14 | super().__init__() 15 | self.num_layers = num_layers 16 | h = [hidden_dim] * (num_layers - 1) 17 | self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) 18 | 19 | def forward(self, x): 20 | for i, layer in enumerate(self.layers): 21 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 22 | return x 23 | 24 | 25 | class PartDecoder(nn.Module): 26 | def __init__(self, 27 | num_part_classes, 28 | input_dim, 29 | hidden_dim, 30 | mask_dim, 31 | ): 32 | super().__init__() 33 | 34 | self.num_part_classes = num_part_classes 35 | 36 | self.mask_head = MLP(input_dim, hidden_dim, mask_dim * num_part_classes, num_layers=3) 37 | 38 | def forward(self, queries, num_parts_per_query, mask_features, part_ids_per_query=None): 39 | # queries shape: [Nb, num_queries (padded to max), hidden_dim] 40 | 41 | mask_embeds = self.mask_head(queries) 42 | mask_embeds = torch.tensor_split(mask_embeds, self.num_part_classes, dim=2) 43 | 44 | # mask_embeds_total is [Nb, num_queries, num_partcls, num_channels] 45 | mask_embeds_total = torch.stack(mask_embeds, dim=2) 46 | embeds_shape = mask_embeds_total.shape 47 | 48 | # mask_embeds_total is [Nb, num_queries * num_partcls, num_channels] 49 | mask_embeds_total = mask_embeds_total.view(embeds_shape[0], 50 | embeds_shape[1] * embeds_shape[2], 51 | embeds_shape[3]) 52 | 53 | # outputs_mask shape is [Nb, num_queries * num_partcls, height, width] 54 | outputs_mask = torch.einsum("bqc,bchw->bqhw", mask_embeds_total, mask_features) 55 | # outputs_mask shape is [Nb, num_queries, num_partcls, height, width] 56 | outputs_mask = outputs_mask.view(embeds_shape[0], 57 | embeds_shape[1], 58 | embeds_shape[2], 59 | outputs_mask.shape[2], 60 | outputs_mask.shape[3]) 61 | 62 | # num_parts_per_query: list of length batch_size 63 | gather_batch_dim = [] 64 | gather_num_queries = [] 65 | gather_num_partcls = [] 66 | 67 | for i, num_parts in enumerate(num_parts_per_query): 68 | if len(num_parts) != 0: 69 | if part_ids_per_query is None: 70 | idx_partcls = torch.cat([torch.arange(0, num_part) for num_part in num_parts], dim=0) 71 | else: 72 | idx_partcls = torch.cat([pt_idx for pt_idx in part_ids_per_query[i]]) 73 | idx_queries = torch.cat([torch.full_like(torch.arange(0, num_part), e, dtype=torch.long) 74 | for e, num_part in enumerate(num_parts)], dim=0) 75 | idx_batch = torch.full_like(idx_partcls, fill_value=i, dtype=torch.long) 76 | 77 | gather_batch_dim.append(idx_batch) 78 | gather_num_partcls.append(idx_partcls) 79 | gather_num_queries.append(idx_queries) 80 | 81 | if len(gather_batch_dim) != 0: 82 | gather_batch_dim = torch.cat(gather_batch_dim, dim=0) 83 | gather_num_queries = torch.cat(gather_num_queries, dim=0) 84 | 
gather_num_partcls = torch.cat(gather_num_partcls, dim=0) 85 | 86 | else: 87 | gather_batch_dim = torch.zeros([0], dtype=torch.long, device=mask_features.device) 88 | gather_num_queries = torch.zeros([0], dtype=torch.long, device=mask_features.device) 89 | gather_num_partcls = torch.zeros([0], dtype=torch.long, device=mask_features.device) 90 | 91 | output_masks = outputs_mask[gather_batch_dim, gather_num_queries, gather_num_partcls] 92 | 93 | return output_masks -------------------------------------------------------------------------------- /tapps/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | if mask is None: 31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 32 | not_mask = ~mask 33 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 34 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 35 | if self.normalize: 36 | eps = 1e-6 37 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 38 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 39 | 40 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 41 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 42 | 43 | pos_x = x_embed[:, :, :, None] / dim_t 44 | pos_y = y_embed[:, :, :, None] / dim_t 45 | pos_x = torch.stack( 46 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 47 | ).flatten(3) 48 | pos_y = torch.stack( 49 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 50 | ).flatten(3) 51 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 52 | return pos 53 | 54 | def __repr__(self, _repr_indent=4): 55 | head = "Positional encoding " + self.__class__.__name__ 56 | body = [ 57 | "num_pos_feats: {}".format(self.num_pos_feats), 58 | "temperature: {}".format(self.temperature), 59 | "normalize: {}".format(self.normalize), 60 | "scale: {}".format(self.scale), 61 | ] 62 | # _repr_indent = 4 63 | lines = [head] + [" " * _repr_indent + line for line in body] 64 | return "\n".join(lines) 65 | -------------------------------------------------------------------------------- /tapps/test_time_augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
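# SemanticSegmentorWithTTA (defined below) runs the wrapped model on each augmented version of an input produced by DatasetMapperTTA, flips the "sem_seg" prediction back along the width axis when an HFlipTransform was applied, and averages the logits over all augmentations.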
2 | import copy 3 | import logging 4 | from itertools import count 5 | 6 | import numpy as np 7 | import torch 8 | from fvcore.transforms import HFlipTransform 9 | from torch import nn 10 | from torch.nn.parallel import DistributedDataParallel 11 | 12 | from detectron2.data.detection_utils import read_image 13 | from detectron2.modeling import DatasetMapperTTA 14 | 15 | 16 | __all__ = [ 17 | "SemanticSegmentorWithTTA", 18 | ] 19 | 20 | 21 | class SemanticSegmentorWithTTA(nn.Module): 22 | """ 23 | A SemanticSegmentor with test-time augmentation enabled. 24 | Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`. 25 | """ 26 | 27 | def __init__(self, cfg, model, tta_mapper=None, batch_size=1): 28 | """ 29 | Args: 30 | cfg (CfgNode): 31 | model (SemanticSegmentor): a SemanticSegmentor to apply TTA on. 32 | tta_mapper (callable): takes a dataset dict and returns a list of 33 | augmented versions of the dataset dict. Defaults to 34 | `DatasetMapperTTA(cfg)`. 35 | batch_size (int): batch the augmented images into this batch size for inference. 36 | """ 37 | super().__init__() 38 | if isinstance(model, DistributedDataParallel): 39 | model = model.module 40 | self.cfg = cfg.clone() 41 | 42 | self.model = model 43 | 44 | if tta_mapper is None: 45 | tta_mapper = DatasetMapperTTA(cfg) 46 | self.tta_mapper = tta_mapper 47 | self.batch_size = batch_size 48 | 49 | def __call__(self, batched_inputs): 50 | """ 51 | Same input/output format as :meth:`SemanticSegmentor.forward` 52 | """ 53 | 54 | def _maybe_read_image(dataset_dict): 55 | ret = copy.copy(dataset_dict) 56 | if "image" not in ret: 57 | image = read_image(ret.pop("file_name"), self.model.input_format) 58 | image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW 59 | ret["image"] = image 60 | if "height" not in ret and "width" not in ret: 61 | ret["height"] = image.shape[1] 62 | ret["width"] = image.shape[2] 63 | return ret 64 | 65 | processed_results = [] 66 | for x in batched_inputs: 67 | result = self._inference_one_image(_maybe_read_image(x)) 68 | processed_results.append(result) 69 | return processed_results 70 | 71 | def _inference_one_image(self, input): 72 | """ 73 | Args: 74 | input (dict): one dataset dict with "image" field being a CHW tensor 75 | Returns: 76 | dict: one output dict 77 | """ 78 | orig_shape = (input["height"], input["width"]) 79 | augmented_inputs, tfms = self._get_augmented_inputs(input) 80 | 81 | final_predictions = None 82 | count_predictions = 0 83 | for input, tfm in zip(augmented_inputs, tfms): 84 | count_predictions += 1 85 | with torch.no_grad(): 86 | if final_predictions is None: 87 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 88 | final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2]) 89 | else: 90 | final_predictions = self.model([input])[0].pop("sem_seg") 91 | else: 92 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 93 | final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2]) 94 | else: 95 | final_predictions += self.model([input])[0].pop("sem_seg") 96 | 97 | final_predictions = final_predictions / count_predictions 98 | return {"sem_seg": final_predictions} 99 | 100 | def _get_augmented_inputs(self, input): 101 | augmented_inputs = self.tta_mapper(input) 102 | tfms = [x.pop("transforms") for x in augmented_inputs] 103 | return augmented_inputs, tfms 104 | -------------------------------------------------------------------------------- /tapps/utils/__init__.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /tapps/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py 3 | """ 4 | Misc functions, including distributed helpers. 5 | 6 | Mostly copy-paste from torchvision references. 7 | """ 8 | from typing import List, Optional 9 | 10 | import torch 11 | import torch.distributed as dist 12 | import torchvision 13 | from torch import Tensor 14 | 15 | 16 | def _max_by_axis(the_list): 17 | # type: (List[List[int]]) -> List[int] 18 | maxes = the_list[0] 19 | for sublist in the_list[1:]: 20 | for index, item in enumerate(sublist): 21 | maxes[index] = max(maxes[index], item) 22 | return maxes 23 | 24 | 25 | class NestedTensor(object): 26 | def __init__(self, tensors, mask: Optional[Tensor]): 27 | self.tensors = tensors 28 | self.mask = mask 29 | 30 | def to(self, device): 31 | # type: (Device) -> NestedTensor # noqa 32 | cast_tensor = self.tensors.to(device) 33 | mask = self.mask 34 | if mask is not None: 35 | assert mask is not None 36 | cast_mask = mask.to(device) 37 | else: 38 | cast_mask = None 39 | return NestedTensor(cast_tensor, cast_mask) 40 | 41 | def decompose(self): 42 | return self.tensors, self.mask 43 | 44 | def __repr__(self): 45 | return str(self.tensors) 46 | 47 | 48 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 49 | # TODO make this more general 50 | if tensor_list[0].ndim == 3: 51 | if torchvision._is_tracing(): 52 | # nested_tensor_from_tensor_list() does not export well to ONNX 53 | # call _onnx_nested_tensor_from_tensor_list() instead 54 | return _onnx_nested_tensor_from_tensor_list(tensor_list) 55 | 56 | # TODO make it support different-sized images 57 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 58 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 59 | batch_shape = [len(tensor_list)] + max_size 60 | b, c, h, w = batch_shape 61 | dtype = tensor_list[0].dtype 62 | device = tensor_list[0].device 63 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 64 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 65 | for img, pad_img, m in zip(tensor_list, tensor, mask): 66 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 67 | m[: img.shape[1], : img.shape[2]] = False 68 | else: 69 | raise ValueError("not supported") 70 | return NestedTensor(tensor, mask) 71 | 72 | 73 | # _onnx_nested_tensor_from_tensor_list() is an implementation of 74 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
75 | @torch.jit.unused 76 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: 77 | max_size = [] 78 | for i in range(tensor_list[0].dim()): 79 | max_size_i = torch.max( 80 | torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) 81 | ).to(torch.int64) 82 | max_size.append(max_size_i) 83 | max_size = tuple(max_size) 84 | 85 | # work around for 86 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 87 | # m[: img.shape[1], :img.shape[2]] = False 88 | # which is not yet supported in onnx 89 | padded_imgs = [] 90 | padded_masks = [] 91 | for img in tensor_list: 92 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 93 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 94 | padded_imgs.append(padded_img) 95 | 96 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) 97 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) 98 | padded_masks.append(padded_mask.to(torch.bool)) 99 | 100 | tensor = torch.stack(padded_imgs) 101 | mask = torch.stack(padded_masks) 102 | 103 | return NestedTensor(tensor, mask=mask) 104 | 105 | 106 | def is_dist_avail_and_initialized(): 107 | if not dist.is_available(): 108 | return False 109 | if not dist.is_initialized(): 110 | return False 111 | return True 112 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | This directory contains few tools for MaskFormer. 2 | 3 | * `convert-torchvision-to-d2.py` 4 | 5 | Tool to convert torchvision pre-trained weights for D2. 6 | 7 | ``` 8 | wget https://download.pytorch.org/models/resnet101-63fe2227.pth 9 | python tools/convert-torchvision-to-d2.py resnet101-63fe2227.pth R-101.pkl 10 | ``` 11 | 12 | * `convert-pretrained-swin-model-to-d2.py` 13 | 14 | Tool to convert Swin Transformer pre-trained weights for D2. 15 | 16 | ``` 17 | pip install timm 18 | 19 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 20 | python tools/convert-pretrained-swin-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 21 | 22 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth 23 | python tools/convert-pretrained-swin-model-to-d2.py swin_small_patch4_window7_224.pth swin_small_patch4_window7_224.pkl 24 | 25 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth 26 | python tools/convert-pretrained-swin-model-to-d2.py swin_base_patch4_window12_384_22k.pth swin_base_patch4_window12_384_22k.pkl 27 | 28 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth 29 | python tools/convert-pretrained-swin-model-to-d2.py swin_large_patch4_window12_384_22k.pth swin_large_patch4_window12_384_22k.pkl 30 | ``` 31 | 32 | * `evaluate_pq_for_semantic_segmentation.py` 33 | 34 | Tool to evaluate PQ (PQ-stuff) for semantic segmentation predictions. 35 | 36 | Usage: 37 | 38 | ``` 39 | python tools/evaluate_pq_for_semantic_segmentation.py --dataset-name ade20k_sem_seg_val --json-file OUTPUT_DIR/inference/sem_seg_predictions.json 40 | ``` 41 | 42 | where `OUTPUT_DIR` is set in the config file. 43 | 44 | * `evaluate_coco_boundary_ap.py` 45 | 46 | Tool to evaluate Boundary AP for instance segmentation predictions. 
47 | 48 | Usage: 49 | 50 | ``` 51 | python tools/coco_instance_evaluation.py --gt-json-file COCO_GT_JSON --dt-json-file COCO_DT_JSON 52 | ``` 53 | 54 | To install Boundary IoU API, run: 55 | 56 | ``` 57 | pip install git+https://github.com/bowenc0221/boundary-iou-api.git 58 | ``` 59 | 60 | * `analyze_model.py` 61 | 62 | Tool to analyze model parameters and flops. 63 | 64 | Usage for semantic segmentation (ADE20K only, use with caution!): 65 | 66 | ``` 67 | python tools/analyze_model.py --num-inputs 1 --tasks flop --use-fixed-input-size --config-file CONFIG_FILE 68 | ``` 69 | 70 | Note that, for semantic segmentation (ADE20K only), we use a dummy image with fixed size that equals to `cfg.INPUT.CROP.SIZE[0] x cfg.INPUT.CROP.SIZE[0]`. 71 | Please do not use `--use-fixed-input-size` for calculating FLOPs on other datasets like Cityscapes! 72 | 73 | Usage for panoptic and instance segmentation: 74 | 75 | ``` 76 | python tools/analyze_model.py --num-inputs 100 --tasks flop --config-file CONFIG_FILE 77 | ``` 78 | 79 | Note that, for panoptic and instance segmentation, we compute the average flops over 100 real validation images. 80 | -------------------------------------------------------------------------------- /tools/convert-pretrained-swin-model-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download pretrained swin model: 12 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 13 | # run the conversion 14 | ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 15 | # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl" 18 | INPUT: 19 | FORMAT: "RGB" 20 | """ 21 | 22 | if __name__ == "__main__": 23 | input = sys.argv[1] 24 | 25 | obj = torch.load(input, map_location="cpu")["model"] 26 | 27 | res = {"model": obj, "__author__": "third_party", "matching_heuristics": True} 28 | 29 | with open(sys.argv[2], "wb") as f: 30 | pkl.dump(res, f) 31 | -------------------------------------------------------------------------------- /tools/convert-torchvision-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download one of the ResNet{18,34,50,101,152} models from torchvision: 12 | wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth 13 | # run the conversion 14 | ./convert-torchvision-to-d2.py r50.pth r50.pkl 15 | # Then, use r50.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/r50.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | RESNETS: 21 | DEPTH: 50 22 | STRIDE_IN_1X1: False 23 | INPUT: 24 | FORMAT: "RGB" 25 | """ 26 | 27 | if __name__ == "__main__": 28 | input = sys.argv[1] 29 | 30 | obj = torch.load(input, map_location="cpu") 31 | 32 | newmodel = {} 33 | for k in list(obj.keys()): 34 | old_k = k 35 | if "layer" not in k: 36 | k = "stem." 
+ k 37 | for t in [1, 2, 3, 4]: 38 | k = k.replace("layer{}".format(t), "res{}".format(t + 1)) 39 | for t in [1, 2, 3]: 40 | k = k.replace("bn{}".format(t), "conv{}.norm".format(t)) 41 | k = k.replace("downsample.0", "shortcut") 42 | k = k.replace("downsample.1", "shortcut.norm") 43 | print(old_k, "->", k) 44 | newmodel[k] = obj.pop(old_k).detach().numpy() 45 | 46 | res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True} 47 | 48 | with open(sys.argv[2], "wb") as f: 49 | pkl.dump(res, f) 50 | if obj: 51 | print("Unconverted keys:", obj.keys()) 52 | -------------------------------------------------------------------------------- /tools/evaluate_coco_boundary_ap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | # Modified by Bowen Cheng from: https://github.com/bowenc0221/boundary-iou-api/blob/master/tools/coco_instance_evaluation.py 4 | 5 | """ 6 | Evaluation for COCO val2017: 7 | python ./tools/coco_instance_evaluation.py \ 8 | --gt-json-file COCO_GT_JSON \ 9 | --dt-json-file COCO_DT_JSON 10 | """ 11 | import argparse 12 | import json 13 | 14 | from boundary_iou.coco_instance_api.coco import COCO 15 | from boundary_iou.coco_instance_api.cocoeval import COCOeval 16 | 17 | 18 | def main(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--gt-json-file", default="") 21 | parser.add_argument("--dt-json-file", default="") 22 | parser.add_argument("--iou-type", default="boundary") 23 | parser.add_argument("--dilation-ratio", default="0.020", type=float) 24 | args = parser.parse_args() 25 | print(args) 26 | 27 | annFile = args.gt_json_file 28 | resFile = args.dt_json_file 29 | dilation_ratio = args.dilation_ratio 30 | if args.iou_type == "boundary": 31 | get_boundary = True 32 | else: 33 | get_boundary = False 34 | cocoGt = COCO(annFile, get_boundary=get_boundary, dilation_ratio=dilation_ratio) 35 | 36 | # remove box predictions 37 | resFile = json.load(open(resFile)) 38 | for c in resFile: 39 | c.pop("bbox", None) 40 | 41 | cocoDt = cocoGt.loadRes(resFile) 42 | cocoEval = COCOeval(cocoGt, cocoDt, iouType=args.iou_type, dilation_ratio=dilation_ratio) 43 | cocoEval.evaluate() 44 | cocoEval.accumulate() 45 | cocoEval.summarize() 46 | 47 | 48 | if __name__ == '__main__': 49 | main() 50 | -------------------------------------------------------------------------------- /utils/panoptic_parts/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include panoptic_parts/specs/dataset_specs/*.yaml 2 | include panoptic_parts/specs/eval_specs/*.yaml -------------------------------------------------------------------------------- /utils/panoptic_parts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tue-mps/tapps/54d8d9f039fa91da6e046bfd8c62c13a9dc5e8b0/utils/panoptic_parts/__init__.py -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 
6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/requirements.txt: -------------------------------------------------------------------------------- 1 | Sphinx==4.5.0 2 | sphinx-rtd-theme==0.5.1 3 | sphinx-autodoc-typehints==1.11.1 4 | recommonmark==0.7.1 5 | sphinx-markdown-tables==0.0.15 -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/source/api_and_code.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | We provide a public, stable API consisting of tested modules. However, in members of the API you may encounter experimental features (e.g. arguments or functions). These have the prefix `experimental\_` and are exempted from stability guarantees. 5 | 6 | The functions of the API are exported (apart from their original modules) also in the panoptic_parts namespace, so they can be imported and used as: 7 | 8 | .. code-block:: python 9 | 10 | import panoptic_parts as pp 11 | pp.decode_uids(uids) 12 | 13 | 14 | 15 | Label format handling 16 | --------------------- 17 | 18 | .. autofunction:: panoptic_parts.utils.format.decode_uids 19 | .. autofunction:: panoptic_parts.utils.format.encode_ids 20 | 21 | Visualization 22 | ------------- 23 | 24 | .. autofunction:: panoptic_parts.utils.visualization.random_colors 25 | .. autofunction:: panoptic_parts.utils.visualization.uid2color 26 | 27 | Misc 28 | ---- 29 | 30 | .. autofunction:: panoptic_parts.utils.utils.safe_write 31 | 32 | 33 | Code Reference 34 | ============== 35 | 36 | Documented/Undocumented functionality of the rest of the code his repo lies here. This functionality will be added to the API in the future. Until then, the following functions may be moved or be unstable. 
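For quick reference, the members documented in this section can be imported from the module paths shown in the directives below. A minimal, illustrative sketch (these members are outside the stable API, so import paths and signatures may still change):

.. code-block:: python

    # module paths as listed in the autoclass/autofunction directives of this section
    from panoptic_parts.specs.dataset_spec import DatasetSpec
    from panoptic_parts.specs.eval_spec import PartPQEvalSpec, SegmentationPartsEvalSpec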
37 | 38 | Dataset & Evaluation specifications 39 | ----------------------------------- 40 | 41 | .. autoclass:: panoptic_parts.specs.dataset_spec.DatasetSpec 42 | :members: 43 | :undoc-members: 44 | 45 | .. autoclass:: panoptic_parts.specs.eval_spec.PartPQEvalSpec 46 | :members: 47 | :undoc-members: 48 | 49 | .. autoclass:: panoptic_parts.specs.eval_spec.SegmentationPartsEvalSpec 50 | :members: 51 | :undoc-members: 52 | 53 | Visualization 54 | ------------- 55 | 56 | .. autofunction:: panoptic_parts.visualization.visualize_label_with_legend.visualize_from_paths 57 | .. autofunction:: panoptic_parts.utils.visualization.experimental_colorize_label 58 | .. autofunction:: panoptic_parts.utils.visualization._generate_shades 59 | .. autofunction:: panoptic_parts.utils.visualization._num_instances_per_sid 60 | .. autofunction:: panoptic_parts.utils.visualization._num_parts_per_sid 61 | .. autofunction:: panoptic_parts.utils.visualization._sid2iids 62 | .. autofunction:: panoptic_parts.utils.visualization._sid2pids 63 | 64 | Evaluation 65 | ---------- 66 | 67 | .. autofunction:: panoptic_parts.utils.evaluation_PartPQ.evaluate_PartPQ_multicore 68 | .. autoclass:: panoptic_parts.utils.experimental_evaluation_IOU.ConfusionMatrixEvaluator_v2 69 | :members: 70 | :undoc-members: 71 | :show-inheritance: 72 | 73 | Misc 74 | ---- 75 | 76 | .. autofunction:: panoptic_parts.utils.utils.compare_pixelwise 77 | .. autofunction:: panoptic_parts.utils.utils._sparse_ids_mapping_to_dense_ids_mapping -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | from recommonmark.transform import AutoStructify 16 | from recommonmark.parser import CommonMarkParser 17 | sys.path.insert(0, os.path.abspath('../..')) 18 | import panoptic_parts 19 | 20 | 21 | # -- Project information ----------------------------------------------------- 22 | 23 | project = 'Part-aware Panoptic Segmentation' 24 | copyright = '2021, The Panoptic Parts datasets team' 25 | author = 'Panagiotis Meletis and Xiaoxiao (Vincent) Wen' 26 | # version_file = '../../panoptic_parts/version.py' 27 | 28 | 29 | # def get_version(): 30 | # with open(version_file, 'r') as f: 31 | # exec(compile(f.read(), version_file, 'exec')) 32 | # return locals()['__version__'] 33 | 34 | 35 | # # The full version, including alpha/beta/rc tags 36 | # release = get_version() 37 | 38 | release = panoptic_parts.__version__ 39 | 40 | # -- General configuration --------------------------------------------------- 41 | 42 | # Add any Sphinx extension module names here, as strings. They can be 43 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 44 | # ones. Napoleon should be loaded before sphinx_autodoc_typehints. 
45 | extensions = [ 46 | 'sphinx.ext.autodoc', 47 | 'sphinx.ext.napoleon', 48 | 'sphinx.ext.viewcode', 49 | 'sphinx_autodoc_typehints', 50 | 'recommonmark', 51 | 'sphinx_markdown_tables', 52 | ] 53 | 54 | # Disable module names in auto documentation 55 | # add_module_names = False 56 | 57 | # sphinx.ext.autodoc options 58 | # set_type_checking_flag = False # defaults to False 59 | typehints_fully_qualified = True # defaults to False 60 | autodoc_preserve_defaults = True 61 | 62 | # Add any paths that contain templates here, relative to this directory. 63 | # templates_path = ['_templates'] 64 | 65 | # The suffix(es) of source filenames. 66 | # You can specify multiple suffix as a list of string: 67 | source_suffix = { 68 | '.rst': 'restructuredtext', 69 | '.md': 'markdown', 70 | } 71 | 72 | 73 | source_parsers = { 74 | '.md': CommonMarkParser, 75 | } 76 | 77 | # The master toctree document. 78 | master_doc = 'index' 79 | 80 | # List of patterns, relative to source directory, that match files and 81 | # directories to ignore when looking for source files. 82 | # This pattern also affects html_static_path and html_extra_path. 83 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 84 | 85 | 86 | # -- Options for HTML output ------------------------------------------------- 87 | 88 | # The theme to use for HTML and HTML Help pages. See the documentation for 89 | # a list of builtin themes. 90 | # 91 | html_theme = 'sphinx_rtd_theme' 92 | 93 | # Add any paths that contain custom static files (such as style sheets) here, 94 | # relative to this directory. They are copied after the builtin static files, 95 | # so a file named "default.css" will overwrite the builtin "default.css". 96 | html_static_path = ['_static'] 97 | 98 | github_doc_root = 'https://github.com/rtfd/recommonmark/tree/master/doc/' 99 | def setup(app): 100 | app.add_config_value('recommonmark_config', { 101 | 'url_resolver': lambda url: github_doc_root + url, 102 | 'auto_toc_tree_section': 'Contents', 103 | }, True) 104 | app.add_transform(AutoStructify) -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/source/contact.md: -------------------------------------------------------------------------------- 1 | ## Contact 2 | 3 | Please feel free to contact us for any suggestions or questions: 4 | 5 | * Panagiotis Meletis: **p**[DOT]**c**[DOT]**meletis**[AT]**tue.nl** 6 | * Xiaoxiao (Vincent) Wen: **wenxx10**[AT]**gmail.com** -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/source/errata_cvpr2021.md: -------------------------------------------------------------------------------- 1 | # CVPR 2021 paper errata 2 | 3 | Here is list of the tables from the paper Part-aware Panoptic Segmentation and the corrected PartPQ results. 4 | -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/source/ground_truth_usage_cases.md: -------------------------------------------------------------------------------- 1 | ## Ground Truth usage cases 2 | 3 | 4 | 5 | 6 | ```eval_rst 7 | We provide for each image a single (image-like) ground truth file encoding semantic-, instance-, and parts- levels annotations. 
Our compact :doc:`Label format ` together with 8 | :func:`panoptic_parts.utils.format.decode_uids` 9 | function enables easy decoding of the labels for various image understanding tasks, including: 10 | ``` 11 | 12 | ```Python 13 | from panoptic_parts.utils.format import decode_uids 14 | # labels: Python int, or np.ndarray, or tf.Tensor, or torch.tensor 15 | # Semantic Segmentation 16 | semantic_ids, _, _ = decode_uids(labels) 17 | 18 | # Instance Segmentation 19 | semantic_ids, instance_ids, _ = decode_uids(labels) 20 | 21 | # Panoptic Segmentation 22 | _, _, _, semantic_instance_ids = decode_uids(labels, return_sids_iids=True) 23 | 24 | # Parts Segmentation / Parts Parsing 25 | _, _, _, semantic_parts_ids = decode_uids(labels, return_sids_pids=True) 26 | 27 | # Instance-level Parts Parsing 28 | semantic_ids, instance_ids, parts_ids = decode_uids(labels) 29 | 30 | # Parts-level Panoptic Segmentation 31 | _, _, _, semantic_instance_ids, semantic_parts_ids = decode_uids(labels, return_sids_iids=True, return_sids_pids=True) 32 | 33 | ``` -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. Part-aware Panoptic Segmentation documentation master file, created by 2 | sphinx-quickstart on Thu Jan 28 11:43:38 2021. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Part-aware Panoptic Segmentation documentation! 7 | =========================================================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | :caption: Get Started 12 | 13 | introduction.md 14 | installation.md 15 | label_format.md 16 | 17 | .. toctree:: 18 | :caption: API & Code Reference 19 | 20 | api_and_code.rst 21 | 22 | 23 | .. toctree:: 24 | :caption: Evaluation 25 | :maxdepth: 1 26 | 27 | evaluate_results.md 28 | 29 | .. toctree:: 30 | :maxdepth: 1 31 | :caption: Examples and Tools 32 | 33 | visualization.md 34 | generate_results.md 35 | ground_truth_usage_cases.md 36 | tools.md 37 | scripts.md 38 | 39 | .. toctree:: 40 | :caption: Contact 41 | 42 | ======= 43 | Contact 44 | ======= 45 | Please feel free to contact us for any suggestions or questions. 46 | 47 | **panoptic.parts@outlook.com** 48 | 49 | Correspondence: Panagiotis Meletis, Vincent (Xiaoxiao) Wen 50 | 51 | The Panoptic Parts datasets team 52 | 53 | 54 | Indices and tables 55 | ================== 56 | 57 | * :ref:`genindex` 58 | * :ref:`search` 59 | -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/source/installation.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | The code can be installed from PyPI and requires at least Python 3.7. It is recommended to install it in a Python virtual environment. 4 | 5 | ```shell 6 | pip install panoptic_parts 7 | ``` 8 | 9 | Some functionality requires extra packages to be installed, e.g. evaluation scripts (tqdm) or PyTorch/TensorFlow (torch/tensorflow).
These can be installed separately or by downloading the `optional.txt` file from this repo and running the following command in the virtual environment: 10 | 11 | ```shell 12 | pip install -r optional.txt 13 | ``` 14 | 15 | After installation you can use the package as: 16 | 17 | ```python 18 | import panoptic_parts as pp 19 | 20 | print(pp.VERSION) 21 | ``` 22 | 23 | There are three scripts defined as entry points by the package: 24 | 25 | ```shell 26 | pp_merge_to_panoptic 27 | pp_merge_to_pps 28 | pp_visualize_label_with_legend 29 | ``` 30 | -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/source/introduction.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | This repository contains code and tools for reading, processing, evaluating on, and visualizing Panoptic Parts datasets. Moreover, it contains code for reproducing our CVPR 2021 paper results. 4 | 5 | ## Datasets 6 | 7 | *Cityscapes-Panoptic-Parts* and *PASCAL-Panoptic-Parts* are created by extending two established datasets for image scene understanding, namely [Cityscapes](https://github.com/mcordts/cityscapesScripts "Cityscapes") and [PASCAL](http://host.robots.ox.ac.uk/pascal/VOC/voc2010/ "PASCAL") datasets. Detailed description of the datasets and various statistics are presented in our technical report in [arxiv](https://arxiv.org/abs/2004.07944 "arxiv.org"). The datasets can be downloaded from: 8 | 9 | - [Cityscapes Panoptic Parts](https://www.cityscapes-dataset.com/login/) 10 | - [PASCAL Panoptic Parts](https://1drv.ms/u/s!AojlpuGgPtL1bHXfIdeL14IeVhI?e=5tNfET) ([alternative link](https://pan.baidu.com/s/1k96Wdg_IyD91kvq87Wy7nw), code: i7ap) 11 | 12 | ## API and code reference 13 | 14 | We provide a public, stable API, and various code utilities that are documented [here](https://panoptic-parts.readthedocs.io/en/stable/api_and_code.html). 15 | 16 | ## Reproducing CVPR 2021 paper 17 | 18 | The part-aware panoptic segmentation results from the paper can be reproduced using [this](https://panoptic-parts.readthedocs.io/en/stable/generate_results.html) guide. 19 | 20 | ## Evaluation metrics 21 | 22 | We provide two metrics for evaluating performance on Panoptic Parts datasets. 23 | 24 | - Part-aware Panoptic Quality (PartPQ): [here](https://panoptic-parts.readthedocs.io/en/stable/evaluate_results.html). 25 | - Intersection over Union (IoU): _TBA_ 26 | 27 | ## Citations 28 | 29 | Please cite us if you find our work useful or you use it in your research: 30 | 31 | ```bibtex 32 | @inproceedings{degeus2021panopticparts, 33 | title = {Part-aware Panoptic Segmentation}, 34 | author = {Daan de Geus and Panagiotis Meletis and Chenyang Lu and Xiaoxiao Wen and Gijs Dubbelman}, 35 | booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, 36 | year = {2021} 37 | } 38 | ``` 39 | 40 | ```bibtex 41 | @article{meletis2020panopticparts, 42 | title = {Cityscapes-Panoptic-Parts and PASCAL-Panoptic-Parts datasets for Scene Understanding}, 43 | author = {Panagiotis Meletis and Xiaoxiao Wen and Chenyang Lu and Daan de Geus and Gijs Dubbelman}, 44 | type = {Technical report}, 45 | institution = {Eindhoven University of Technology}, 46 | date = {16/04/2020}, 47 | url = {https://github.com/tue-mps/panoptic_parts}, 48 | eprint={2004.07944}, 49 | archivePrefix={arXiv}, 50 | primaryClass={cs.CV} 51 | } 52 | ``` 53 | 54 | 55 | 56 | ```eval_rst 57 | .. 
image:: _static/mps_logo.png 58 | :target: https://www.tue.nl/en/research/research-groups/signal-processing-systems/mobile-perception-systems-lab/ 59 | :alt: MPS 60 | :height: 100 61 | 62 | .. image:: _static/tue_logo.jpg 63 | :target: https://www.tue.nl/ 64 | :alt: TU/e 65 | :height: 100 66 | ``` 67 | -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/source/label_format.md: -------------------------------------------------------------------------------- 1 | # Serialization format: hierarchical information encoding 2 | 3 | The goal of the format is to include (per image) all annotations in a single, image-like label file with a consistent encoding across all abstractions and information levels. This enables easy transfer, reading, and compact handling of the annotations. The following hierarchical structure is chosen, which extends the Cityscapes serialization format. 4 | 5 | 6 | 7 | 8 | ```eval_rst 9 | .. image:: _static/hierarchical_format.jpg 10 | :target: _static/hierarchical_format.jpg 11 | :alt: Hierarchical Label Format 12 | ``` 13 | 14 | We encode three levels of labels (semantic, instance, and parts) in a single image-like file. Labels for both datasets follow this format. 15 | Each pixel in our hierarchical label format has an up to 7-digit _universal id_ (_uid_) containing: 16 | 17 | - An up to 2-digit _semantic id_ (_sid_), encoding the semantic-level _things_ or _stuff_ class. 18 | - An up to 3-digit _instance id_ (_iid_), a counter of instances per _things_ class and per image. This is optional. 19 | - An up to 2-digit _part id_ (_pid_), encoding the parts-level semantic class per-instance and per-image. This is optional, but if provided requires also an _iid_. Only _things_ parts are covered by this format. 20 | 21 | We compactly encode the aforementioned _ids_ (_sid_, _iid_, _pid_) into an up to 7-digit _uid_. Starting from the left, the first one or two digits encode the semantic class, the next three encode the instance (after zero pre-padding), and the final two encode the parts class (after zero pre-padding). 22 | 23 | Using the above encoding: 24 | 25 | - 1-2 digit _uids_ encode only semantic-level labels 26 | - 4-5 digit _uids_ encode semantic-instance-level labels 27 | - 6-7 digit _uids_ encode semantic-instance-parts-level labels 28 | 29 | For example, in _Cityscapes-Panoptic-Parts_, a _sky_ (_stuff_) pixel will have _uid_ = 23; a _car_ (_things_) pixel that is labeled only on the semantic level will have _uid_ = 26, or _uid_ = 26002 if it is also labeled on the instance level; and a _person_ (_things_) pixel that is labeled on all three levels can have _uid_ = 2401002. 30 | 31 | > The format covers parts-level classes for _stuff_ semantic classes using a dummy instance id (`iid = 0`). Cityscapes Panoptic Parts and PASCAL Panoptic Parts do not currently define any _stuff_ with part-level classes. This is a feature that can be used in future extensions.
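To make the digit arithmetic above concrete, the following is a minimal sketch of how a single _uid_ splits into (_sid_, _iid_, _pid_). The helper name `split_uid` is illustrative only; in practice use `panoptic_parts.utils.format.decode_uids`, which also accepts NumPy/TensorFlow/PyTorch tensors and provides the optional `return_sids_iids` / `return_sids_pids` outputs.

```python
def split_uid(uid: int):
    """Split an up-to-7-digit uid into (sid, iid, pid); iid/pid are None when not encoded."""
    if uid <= 99:                 # 1-2 digits: semantic level only, e.g. 23 (sky)
        return uid, None, None
    if uid <= 99_999:             # 4-5 digits: semantic + instance, e.g. 26002 (car, instance 2)
        return uid // 1_000, uid % 1_000, None
    # 6-7 digits: semantic + instance + parts, e.g. 2401002 (person, instance 10, part 2)
    return uid // 100_000, (uid // 100) % 1_000, uid % 100


print(split_uid(2401002))  # (24, 10, 2)
```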
32 | 33 | ## Unlabeled/Ignored pixels 34 | 35 | We handle the unlabeled / void / ignored / "do not care pixels" in the three levels as follows: 36 | 37 | - Semantic level: For _Cityscapes-Panoptic-Parts_ we use the original Cityscapes void class. For _PASCAL-Panoptic-Parts_ we use the class with _uid_ = 0. 38 | - Instance level: For instances the void class is not required. If a pixel does not belong to an object or cannot be labeled on instance level then it has only an up to 2-digit _semantic id_. 39 | - Parts level: For both datasets we use the convention that, for each semantic class, the part-level class with _pid_ = 0 represents the void pixels, e.g., for a _person_ pixel, _uid_ = 2401000 represents the void parts pixels of instance 10. The need for a void class arises during the manual annotation process but in principle it is not needed at the parts level. Thus, we try to minimize void parts level pixels and assign them instead only the semantic- or semantic-instance -level labels. 40 | -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/source/scripts.md: -------------------------------------------------------------------------------- 1 | ## Scripts -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/source/tools.md: -------------------------------------------------------------------------------- 1 | ## Tools -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/source/visualization.md: -------------------------------------------------------------------------------- 1 | ## Visualization of ground truth 2 | 3 | ### Cityscapes-Panoptic-Parts 4 | 5 | ```eval_rst 6 | .. list-table:: 7 | :header-rows: 1 8 | 9 | * - 10 | .. image:: _static/aachen_000012_000019_leftImg8bit.jpg 11 | :target: _static/aachen_000012_000019_leftImg8bit.jpg 12 | :alt: aachen_000012_000019_leftImg8bit 13 | 14 | - 15 | .. image:: _static/aachen_000012_000019_uids_pids_colored.png 16 | :target: _static/aachen_000012_000019_uids_pids_colored.png 17 | :alt: aachen_000012_000019_uids_pids_colored 18 | 19 | * - 20 | .. image:: _static/frankfurt_000001_011835_leftImg8bit.jpg 21 | :target: _static/frankfurt_000001_011835_leftImg8bit.jpg 22 | :alt: frankfurt_000001_011835_leftImg8bit 23 | 24 | - 25 | .. image:: _static/frankfurt_000001_011835_uids_pids_colored.png 26 | :target: _static/frankfurt_000001_011835_uids_pids_colored.png 27 | :alt: frankfurt_000001_011835_uids_pids_colored 28 | ``` 29 | 30 | ### PASCAL-Panoptic-Parts 31 | 32 | ```eval_rst 33 | .. list-table:: 34 | :header-rows: 1 35 | 36 | * - 37 | .. image:: _static/2008_000393.jpg 38 | :target: _static/2008_000393.jpg 39 | :alt: 2008_000393 40 | 41 | - 42 | .. image:: _static/2008_000393_colored.png 43 | :target: _static/2008_000393_colored.png 44 | :alt: 2008_000393_colored 45 | 46 | - 47 | .. image:: _static/2008_000716.jpg 48 | :target: _static/2008_000716.jpg 49 | :alt: 2008_000716 50 | 51 | - 52 | .. image:: _static/2008_000716_colored.png 53 | :target: _static/2008_000716_colored.png 54 | :alt: 2008_000716_colored 55 | 56 | * - 57 | .. image:: _static/2008_007456.jpg 58 | :target: _static/2008_007456.jpg 59 | :alt: 2008_007456 60 | 61 | - 62 | .. image:: _static/2008_007456_colored_repainted.png 63 | :target: _static/2008_007456_colored_repainted.png 64 | :alt: 2008_007456_colored_repainted 65 | 66 | - 67 | .. 
image:: _static/2010_002356.jpg 68 | :target: _static/2010_002356.jpg 69 | :alt: 2010_002356 70 | 71 | - 72 | .. image:: _static/2010_002356_colored.png 73 | :target: _static/2010_002356_colored.png 74 | :alt: 2010_002356_colored 75 | ``` 76 | -------------------------------------------------------------------------------- /utils/panoptic_parts/optional.txt: -------------------------------------------------------------------------------- 1 | tensorflow>=2.4.0 2 | torch>=1.7.0 3 | git+https://github.com/cocodataset/panopticapi.git 4 | tqdm 5 | pycocotools>=2.0.0 -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/__init__.py: -------------------------------------------------------------------------------- 1 | from panoptic_parts.utils.format import decode_uids, encode_ids 2 | from panoptic_parts.utils.visualization import uid2color, random_colors 3 | from panoptic_parts.utils.utils import safe_write 4 | 5 | 6 | __version__ = '2.0rc5' 7 | VERSION = __version__ 8 | -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/cityscapes_panoptic_parts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tue-mps/tapps/54d8d9f039fa91da6e046bfd8c62c13a9dc5e8b0/utils/panoptic_parts/panoptic_parts/cityscapes_panoptic_parts/__init__.py -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/cityscapes_panoptic_parts/dataset_v2.0/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Cityscapes Panoptic Parts annotations 3 | We have manually annotated 5 scene-level classes with 23 part-level classes from Cityscapes vehicle and human categories. 4 | 5 | You can download the dataset from the [Cityscapes Dataset](https://www.cityscapes-dataset.com/login/) website. 6 | 7 | Pixels of humans and vehicles (_sids_: 24, 25, 26, 27, or 28) that are not assigned to any part-level class by the annotation team or it is not clearly visible to which part they belong to, have _pid_ = 0 or they maintain their semantic-level or semantic-instance-level labels. From the perspective of semantics the labels `SS_III_00` and `SS_III` are equivalent. 8 | 9 | ## Human (person (_sid_: 24), rider (_sid_: 25)) pids: 10 | 11 | * 0: unlabeled / void 12 | * 1: torso 13 | * 2: head 14 | * 3: arms 15 | * 4: legs 16 | 17 | > Note: For human and rider scene classes a _pid_ 5 exists in a minority of ground truth files (~10). This _pid_ is an artefact of data preprocessing. 
These artefact can be automatically set to void _pid_ 0 (unlabeled part) using the decoding functionality provided in the following snippet: 18 | 19 | ```python 20 | uids = np.array(Image.open('gt_filepath.tif'), dtype=np.int32) 21 | dataset_spec = DatasetSpec('cpp_datasetspec.yaml') 22 | _, _, pids = decode_uids(uids, experimental_dataset_spec=dataset_spec, experimental_correct_range=True) 23 | ``` 24 | 25 | ## Vehicle (car (_sid_: 26), truck (_sid_: 27), bus (_sid_: 28)) pids: 26 | 27 | * 0: unlabeled / void 28 | * 1: windows 29 | * 2: wheels 30 | * 3: lights 31 | * 4: license plate 32 | * 5: chassis 33 | 34 | ## Contact 35 | 36 | Please feel free to contact us for any suggestions or questions: 37 | 38 | * Panagiotis Meletis: **p**[DOT]**c**[DOT]**meletis**[AT]**tue.nl** 39 | * Xiaoxiao (Vincent) Wen: **wenxx10**[AT]**gmail.com** 40 | -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tue-mps/tapps/54d8d9f039fa91da6e046bfd8c62c13a9dc5e8b0/utils/panoptic_parts/panoptic_parts/evaluation/__init__.py -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/evaluation/prepare_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import glob 3 | import os 4 | import argparse 5 | 6 | from tqdm import tqdm 7 | from PIL import Image 8 | 9 | 10 | def create_image_list(dataset_dir, output_dir, dataset): 11 | """ 12 | :param dataset_dir: path to the PPS ground-truths file for the data split 13 | :param output_dir: directory where the images.json file will be stored 14 | :param dataset: dataset name ('Cityscapes' or 'Pascal') 15 | 16 | :return: 17 | """ 18 | print("Creating images list...") 19 | images_list = list() 20 | 21 | # Get all filenames in the GT directory 22 | filenames = [file for file in glob.glob(dataset_dir + "/*")] 23 | if dataset == 'Cityscapes': 24 | filenames.extend([file for file in glob.glob(dataset_dir + "/*/*")]) 25 | 26 | for filename in tqdm(filenames): 27 | if filename.endswith(str('.tif')): 28 | image_dict = dict() 29 | file_name_gt = os.path.basename(filename) 30 | 31 | # Set names for file_name and image_id 32 | if dataset == 'Cityscapes': 33 | file_name = file_name_gt.replace('_gtFinePanopticParts.tif', '_gtFine_leftImg8bit.png') 34 | image_id = file_name_gt.replace('_gtFinePanopticParts.tif', '') 35 | else: 36 | file_name = file_name_gt.replace('.tif', '.png') 37 | image_id = file_name_gt.replace('.tif', '') 38 | image_dict['file_name'] = file_name 39 | image_dict['id'] = image_id 40 | 41 | # Open gt image and store image dimensions 42 | img = Image.open(filename) 43 | image_dict['width'], image_dict['height'] = img.size[0:2] 44 | 45 | images_list.append(image_dict) 46 | 47 | images_dict = {'images': images_list} 48 | 49 | # Save images.json file 50 | output_path = os.path.join(output_dir, 'images.json') 51 | with open(output_path, 'w') as fp: 52 | json.dump(images_dict, fp) 53 | 54 | print("Created images list and stored at {}.".format(output_path)) 55 | 56 | if __name__ == '__main__': 57 | parser = argparse.ArgumentParser( 58 | description="Creates an images.json file for the Cityscapes Panoptic Parts or Pascal Panoptic Parts dataset." 
59 | ) 60 | 61 | parser.add_argument('dataset_dir', type=str, 62 | help="path to the PPS ground-truths file for the data split") 63 | parser.add_argument('output_dir', type=str, 64 | help="directory where the images.json file will be stored") 65 | parser.add_argument('dataset', type=str, 66 | help="dataset name ('Cityscapes' or 'Pascal')") 67 | args = parser.parse_args() 68 | 69 | create_image_list(args.dataset_dir, 70 | args.output_dir, 71 | dataset=args.dataset) 72 | -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/merging/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tue-mps/tapps/54d8d9f039fa91da6e046bfd8c62c13a9dc5e8b0/utils/panoptic_parts/panoptic_parts/merging/__init__.py -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/pascal_panoptic_parts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tue-mps/tapps/54d8d9f039fa91da6e046bfd8c62c13a9dc5e8b0/utils/panoptic_parts/panoptic_parts/pascal_panoptic_parts/__init__.py -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/pascal_panoptic_parts/dataset_v2.0/README.md: -------------------------------------------------------------------------------- 1 | # Contact 2 | 3 | Please feel free to contact us for any suggestions or questions: 4 | 5 | * Panagiotis Meletis: **p**[DOT]**c**[DOT]**meletis**[AT]**tue.nl** 6 | * Xiaoxiao (Vincent) Wen: **wenxx10**[AT]**gmail.com** 7 | -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/specs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tue-mps/tapps/54d8d9f039fa91da6e046bfd8c62c13a9dc5e8b0/utils/panoptic_parts/panoptic_parts/specs/__init__.py -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/specs/dataset_specs/cpp_datasetspec.yaml: -------------------------------------------------------------------------------- 1 | version: '2.0' 2 | comments: 3 | - refer to ppp_datasetspec for now 4 | name: Cityscapes Panoptic Parts 5 | 6 | # scene_class2part_classes: An ordered mapping from scene-level class to part-level classes. 7 | # Refer to ppp_datasetspec for now. 8 | scene_class2part_classes: { 9 | ego vehicle: [], 10 | rectification border: [], 11 | out of roi: [], 12 | static: [], 13 | dynamic: [], 14 | ground: [], 15 | road: [], 16 | sidewalk: [], 17 | parking: [], 18 | rail track: [], 19 | building: [], 20 | wall: [], 21 | fence: [], 22 | guard rail: [], 23 | bridge: [], 24 | tunnel: [], 25 | pole: [], 26 | polegroup: [], 27 | traffic light: [], 28 | traffic sign: [], 29 | vegetation: [], 30 | terrain: [], 31 | sky: [], 32 | person: [torso, head, arm, leg], 33 | rider: [torso, head, arm, leg], 34 | car: [window, wheel, light, license plate, chassis], 35 | truck: [window, wheel, light, license plate, chassis], 36 | bus: [window, wheel, light, license plate, chassis], 37 | caravan: [], 38 | trailer: [], 39 | train: [], 40 | motorcycle: [], 41 | bicycle: [], 42 | license plate: [], 43 | } 44 | 45 | # Refer to ppp_datasetspec for now. 
46 | scene_classes_with_instances: [ 47 | person, rider, car, truck, bus, caravan, trailer, train, motorcycle, bicycle 48 | ] 49 | 50 | # Refer to ppp_datasetspec for now. 51 | scene_class2color: { 52 | ego vehicle: [0, 0, 0], 53 | rectification border: [0, 0, 0], 54 | out of roi: [0, 0, 0], 55 | static: [0, 0, 0], 56 | dynamic: [111, 74, 0], 57 | ground: [81, 0, 81], 58 | road: [128, 64, 128], 59 | sidewalk: [244, 35, 232], 60 | parking: [250, 170, 160], 61 | rail track: [230, 150, 140], 62 | building: [70, 70, 70], 63 | wall: [102, 102, 156], 64 | fence: [190, 153, 153], 65 | guard rail: [180, 165, 180], 66 | bridge: [150, 100, 100], 67 | tunnel: [150, 120, 90], 68 | pole: [153, 153, 153], 69 | polegroup: [153, 153, 153], 70 | traffic light: [250, 170, 30], 71 | traffic sign: [220, 220, 0], 72 | vegetation: [107, 142, 35], 73 | terrain: [152, 251, 152], 74 | sky: [70, 130, 180], 75 | person: [220, 20, 60], 76 | rider: [255, 0, 0], 77 | car: [0, 0, 142], 78 | truck: [0, 0, 70], 79 | bus: [0, 60, 100], 80 | caravan: [0, 0, 90], 81 | trailer: [0, 0, 110], 82 | train: [0, 80, 100], 83 | motorcycle: [0, 0, 230], 84 | bicycle: [119, 11, 32], 85 | license plate: [0, 0, 142], 86 | } 87 | -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/specs/eval_specs/ppq_cpp_19_23_cvpr21_default_evalspec.yaml: -------------------------------------------------------------------------------- 1 | version: 2.0 2 | comments: 3 | First version containing all information I (Daan) think is necessary for merging to panoptic and part-aware panoptic (and it should also be usable for PartPQ evaluation) 4 | 5 | dataset_spec_path: utils/panoptic_parts/panoptic_parts/specs/dataset_specs/cpp_datasetspec.yaml 6 | 7 | # To be used for evaluation 8 | ignore_label: 255 9 | 10 | # To be used for evaluation 11 | dataset_sid2eval_sid: { 12 | # evaluated 13 | 7: 7, 8: 8, 11: 11, 12: 12, 13: 13, 14 | 17: 17, 19: 19, 20: 20, 21: 21, 22: 22, 15 | 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 16 | 28: 28, 31: 31, 32: 32, 33: 33, 17 | # ignored 18 | 0: IGNORED, 1: IGNORED, 2: IGNORED, 3: IGNORED, 4: IGNORED, 5: IGNORED, 6: IGNORED, 9: IGNORED, 10: IGNORED, 14: IGNORED, 15: IGNORED, 16: IGNORED, 18: IGNORED, 29: IGNORED, 30: IGNORED, 19 | # default 20 | DEFAULT: IGNORED 21 | } 22 | 23 | # To be used for evaluation 24 | dataset_sid_pid2eval_sid_pid: { 25 | # evaluated 26 | 24_01: 24_01, 24_02: 24_02, 24_03: 24_03, 24_04: 24_04, 27 | 25_01: 25_01, 25_02: 25_02, 25_03: 25_03, 25_04: 25_04, 28 | 26_01: 26_01, 26_02: 26_02, 26_03: 26_03, 26_04: 26_04, 26_05: 26_05, 29 | 27_01: 27_01, 27_02: 27_02, 27_03: 27_03, 27_04: 27_04, 27_05: 27_05, 30 | 28_01: 28_01, 28_02: 28_02, 28_03: 28_03, 28_04: 28_04, 28_05: 28_05, 31 | # ignored 32 | 24: IGNORED, 25: IGNORED, 26: IGNORED, 27: IGNORED, 28: IGNORED, 33 | 0: IGNORED, 1: IGNORED, 2: IGNORED, 3: IGNORED, 4: IGNORED, 5: IGNORED, 6: IGNORED, 9: IGNORED, 10: IGNORED, 14: IGNORED, 15: IGNORED, 16: IGNORED, 18: IGNORED, 29: IGNORED, 30: IGNORED, 34 | # default 35 | DEFAULT: IGNORED 36 | } 37 | 38 | # Used for merging and evaluation 39 | eval_sid_things: [24, 25, 26, 27, 28, 31, 32, 33] 40 | eval_sid_stuff: [7, 8, 11, 12, 13, 17, 19, 20, 21, 22, 23] 41 | eval_sid_parts: [24, 25, 26, 27, 28] 42 | eval_sid_no_parts: [7, 8, 11, 12, 13, 17, 19, 20, 21, 22, 23, 31, 32, 33] 43 | 44 | # Used for merging panoptic & parts 45 | eval_sid_pid2eval_pid_flat: { 46 | 24_01: 1, 24_02: 2, 24_03: 3, 24_04: 4, 47 | 25_01: 5, 25_02: 6, 25_03: 7, 25_04: 8, 48 | 
26_01: 9, 26_02: 10, 26_03: 11, 26_04: 12, 26_05: 13, 49 | 27_01: 14, 27_02: 15, 27_03: 16, 27_04: 17, 27_05: 18, 50 | 28_01: 19, 28_02: 20, 28_03: 21, 28_04: 22, 28_05: 23, 51 | } 52 | 53 | 54 | # Names for all labels that are to be evaluated 55 | eval_sid2scene_label: { 56 | 7: road, 8: sidewalk, 11: building, 12: wall, 57 | 13: fence, 17: pole, 19: traffic light, 20: traffic sign, 58 | 21: vegetation, 22: terrain, 23: sky, 24: person, 59 | 25: rider, 26: car, 27: truck, 28: bus, 60 | 31: train, 32: motorcycle, 33: bicycle 61 | } 62 | 63 | eval_pid_flat2scene_part_label: { 64 | 1: person-torso, 2: person-head, 3: person-arms, 4: person-legs, 65 | 5: rider-torso, 6: rider-head, 7: rider-arms, 8: rider-legs, 66 | 9: car-windows, 10: car-wheels, 11: car-lights, 12: car-license_plate, 13: car-chassis, 67 | 14: truck-windows, 15: truck-wheels, 16: truck-lights, 17: truck-license_plate, 18: truck-chassis, 68 | 19: bus-windows, 20: bus-wheels, 21: bus-lights, 22: bus-license_plate, 23: bus-chassis, 69 | } 70 | -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/specs/eval_specs/ppq_cpp_19_23_cvpr21_grouped_evalspec.yaml: -------------------------------------------------------------------------------- 1 | version: 1.0 2 | comments: Information required to calculate the PartPQ for CPP 3 | 4 | dataset_spec_path: utils/panoptic_parts/panoptic_parts/specs/dataset_specs/cpp_datasetspec.yaml 5 | 6 | # To be used for evaluation 7 | ignore_label: 255 8 | 9 | # To be used for evaluation 10 | dataset_sid2eval_sid: { 11 | # evaluated 12 | 7: 7, 8: 8, 11: 11, 12: 12, 13: 13, 13 | 17: 17, 19: 19, 20: 20, 21: 21, 22: 22, 14 | 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 15 | 28: 28, 31: 31, 32: 32, 33: 33, 16 | # ignored 17 | 0: IGNORED, 1: IGNORED, 2: IGNORED, 3: IGNORED, 4: IGNORED, 5: IGNORED, 6: IGNORED, 9: IGNORED, 10: IGNORED, 14: IGNORED, 15: IGNORED, 16: IGNORED, 18: IGNORED, 29: IGNORED, 30: IGNORED, 18 | # default 19 | DEFAULT: IGNORED 20 | } 21 | 22 | # To be used for evaluation 23 | dataset_sid_pid2eval_sid_pid: { 24 | # evaluated 25 | 24_01: 24_01, 24_02: 24_02, 24_03: 24_03, 24_04: 24_04, 26 | 25_01: 25_01, 25_02: 25_02, 25_03: 25_03, 25_04: 25_04, 27 | 26_01: 26_01, 26_02: 26_02, 26_03: 26_03, 26_04: 26_04, 26_05: 26_05, 28 | 27_01: 27_01, 27_02: 27_02, 27_03: 27_03, 27_04: 27_04, 27_05: 27_05, 29 | 28_01: 28_01, 28_02: 28_02, 28_03: 28_03, 28_04: 28_04, 28_05: 28_05, 30 | # ignored 31 | 24: IGNORED, 25: IGNORED, 26: IGNORED, 27: IGNORED, 28: IGNORED, 32 | 0: IGNORED, 1: IGNORED, 2: IGNORED, 3: IGNORED, 4: IGNORED, 5: IGNORED, 6: IGNORED, 9: IGNORED, 10: IGNORED, 14: IGNORED, 15: IGNORED, 16: IGNORED, 18: IGNORED, 29: IGNORED, 30: IGNORED, 33 | # default 34 | DEFAULT: IGNORED 35 | } 36 | 37 | # Used for merging and evaluation 38 | eval_sid_things: [24, 25, 26, 27, 28, 31, 32, 33] 39 | eval_sid_stuff: [7, 8, 11, 12, 13, 17, 19, 20, 21, 22, 23] 40 | eval_sid_parts: [24, 25, 26, 27, 28] 41 | eval_sid_no_parts: [7, 8, 11, 12, 13, 17, 19, 20, 21, 22, 23, 31, 32, 33] 42 | 43 | # Used for merging panoptic & parts 44 | eval_sid_pid2eval_pid_flat: { 45 | 24_01: 1, 24_02: 2, 24_03: 3, 24_04: 4, 46 | 25_01: 1, 25_02: 2, 25_03: 3, 25_04: 4, 47 | 26_01: 5, 26_02: 6, 26_03: 7, 26_04: 8, 26_05: 9, 48 | 27_01: 5, 27_02: 6, 27_03: 7, 27_04: 8, 27_05: 9, 49 | 28_01: 5, 28_02: 6, 28_03: 7, 28_04: 8, 28_05: 9, 50 | } 51 | 52 | # Names for all labels that are to be evaluated 53 | eval_sid2scene_label: { 54 | 7: road, 8: sidewalk, 11: building, 12: wall, 
55 | 13: fence, 17: pole, 19: traffic light, 20: traffic sign, 56 | 21: vegetation, 22: terrain, 23: sky, 24: person, 57 | 25: rider, 26: car, 27: truck, 28: bus, 58 | 31: train, 32: motorcycle, 33: bicycle 59 | } 60 | 61 | eval_pid_flat2scene_part_label: { 62 | 1: human-torso, 2: human-head, 3: human-arms, 4: human-legs, 63 | 5: vehicle-windows, 6: vehicle-wheels, 7: vehicle-lights, 8: vehicle-license_plate, 9: vehicle-chassis, 64 | } 65 | -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tue-mps/tapps/54d8d9f039fa91da6e046bfd8c62c13a9dc5e8b0/utils/panoptic_parts/panoptic_parts/utils/__init__.py -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/utils/internal/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tue-mps/tapps/54d8d9f039fa91da6e046bfd8c62c13a9dc5e8b0/utils/panoptic_parts/panoptic_parts/utils/internal/__init__.py -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/utils/internal/convert_annotations_v1_to_v2.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import os.path as op 4 | import sys 5 | from tqdm import tqdm 6 | 7 | import numpy as np 8 | from PIL import Image 9 | 10 | from panoptic_parts.utils.format import decode_uids, encode_ids 11 | 12 | 13 | def convert(): 14 | basepath = 'pascal_panoptic_parts/releases/20201704/pascal_panoptic_parts_v1' 15 | 16 | filepaths = glob.glob(op.join(basepath, 'training/*.tif')) + glob.glob(op.join(basepath, 'validation/*.tif')) 17 | 18 | for fp in tqdm(filepaths): 19 | uids = np.asarray(Image.open(fp), dtype=np.int32) 20 | # transformation 1 (tvmonitor-unlabeled becomes tvmonitor-frame): {20_XXX, 20_XXX_00} -> 20_XXX_02 21 | sids, iids, pids, sids_iids, sids_pids = decode_uids(uids, return_sids_iids=True, return_sids_pids=True) 22 | pids = np.where(np.logical_and(iids >= 0, 23 | np.logical_or(np.equal(sids_pids, 20), np.equal(sids_pids, 20_00))), 24 | 2, 25 | pids) 26 | uids = encode_ids(sids, iids, pids) 27 | # transformation 1 (remove 00): XX_XXX_00 -> XX_XXX 28 | _, _, pids, sids_iids = decode_uids(uids, return_sids_iids=True) 29 | uids = np.where(np.logical_and(uids >= 1_000_00, np.equal(pids, 0)), 30 | sids_iids, 31 | uids) 32 | 33 | path_new = fp.replace('20201704/pascal_panoptic_parts_v1', '20210503/pascal_panoptic_parts_v2') 34 | assert not op.exists(path_new), f'path {path_new} exists.' 
35 | os.makedirs(op.dirname(path_new), exist_ok=True) 36 | Image.fromarray(uids, mode='I').save(path_new, format='TIFF', compression='tiff_lzw') 37 | 38 | 39 | def validate(): 40 | basepath_v1 = 'pascal_panoptic_parts/releases/20201704/pascal_panoptic_parts_v1' 41 | basepath_v2 = 'pascal_panoptic_parts/releases/20210503/pascal_panoptic_parts_v2' 42 | 43 | filepaths_v1 = glob.glob(op.join(basepath_v1, 'training/*.tif')) + glob.glob(op.join(basepath_v1, 'validation/*.tif')) 44 | filepaths_v2 = [fp.replace('20201704/pascal_panoptic_parts_v1', '20210503/pascal_panoptic_parts_v2') for fp in filepaths_v1] 45 | 46 | for i, (f1, f2) in enumerate(zip(filepaths_v1, filepaths_v2)): 47 | l1 = np.asanyarray(Image.open(f1), dtype=np.int32) 48 | l2 = np.asanyarray(Image.open(f2), dtype=np.int32) 49 | # if there are differences print the unique tuples with (uid_l1, uid_l2) corresponding 50 | # to the same spatial position 51 | cond = l1 != l2 52 | if np.any(cond): 53 | uids_tuples = np.unique(np.stack([l1[cond], l2[cond]]), axis=1) 54 | print(i, *(uids_tuples[:, j] for j in range(uids_tuples.shape[1]))) 55 | else: 56 | print('No diff.') 57 | 58 | 59 | if __name__ == '__main__': 60 | # convert() 61 | validate() 62 | -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/utils/internal/populate_ppp_official_evalspec.py: -------------------------------------------------------------------------------- 1 | from ruamel.yaml import YAML 2 | 3 | from panoptic_parts.specs.dataset_spec import DatasetSpec 4 | 5 | 6 | with open('ppp_20_58_iou_evalspec.yaml') as fd: 7 | gspec = YAML().load(fd) 8 | 9 | dspec = DatasetSpec(gspec['dataset_spec_path']) 10 | 11 | with open('ppq_ppp_59_57_evalspec.yaml') as fd: 12 | espec = YAML().load(fd) 13 | 14 | 15 | 16 | # dataset_sid_pid2eval_sid_pid 17 | ################################################################################################### 18 | part_groupings = gspec['part_groupings'] 19 | dataset_sid_pid2eval_sid_pid = dict() 20 | for sid_pid, (scene_class, part_class) in dspec.sid_pid2scene_class_part_class.items(): 21 | if sid_pid == 0 or scene_class not in part_groupings.keys(): 22 | continue 23 | sid = sid_pid // 100 24 | pid_new = None 25 | # find the part_class position in the part_groupings dict 26 | for pid_new_cand, (part_class_new, part_classes_old) in enumerate(part_groupings[scene_class].items(), start=1): 27 | for part_class_old in part_classes_old: 28 | if part_class_old == part_class: 29 | pid_new = pid_new_cand 30 | break 31 | else: # ie inner loop DOES NOT break, continue mid loop 32 | continue 33 | break # if inner loop breaks, then break mid loop 34 | else: # ie mid loop DOES NOT break, continue outer loop 35 | continue 36 | dataset_sid_pid2eval_sid_pid[sid_pid] = sid * 100 + pid_new 37 | 38 | # sanity check 39 | esd2epf = espec['eval_sid_pid2eval_pid_flat'] 40 | assert all(v in esd2epf.keys() for v in dataset_sid_pid2eval_sid_pid.values()) 41 | 42 | # print in a friendly copy-paste way to yaml 43 | sid_prev = 0 44 | for k, v in dataset_sid_pid2eval_sid_pid.items(): 45 | sid_cur = k // 100 46 | if sid_cur > sid_prev: 47 | sid_prev = sid_cur 48 | print('\n ', end='') 49 | print('{}_{:02d}'.format(*divmod(k, 100)) + ': ' + '{}_{:02d}'.format(*divmod(v, 100)) + ',', end=' ') 50 | ################################################################################################### 51 | 52 | # eval_sid2scene_label 53 | 
################################################################################################### 54 | # eval_sid2dataset_sid = espec['eval_sid2scene_label'] 55 | # eval_sid2scene_label = {es: dspec.scene_class_from_sid(ds) for es, ds in eval_sid2dataset_sid.items()} 56 | ################################################################################################### 57 | 58 | # eval_pid_flat2scene_part_label 59 | ################################################################################################### 60 | eval_pid_flat = espec['eval_pid_flat2scene_part_label'].keys() 61 | eval_pid_flat2eval_sid_pid = {v: k for k, v in espec['eval_sid_pid2eval_pid_flat'].items()} 62 | eval_pid_flat2eval_sid_pid[0] = 0 63 | 64 | part_groupings['UNLABELED'] = {'UNLABELED': ['UNLABELED']} 65 | 66 | eval_pid_flat2scene_part_label = dict() 67 | for k in eval_pid_flat: 68 | eval_sid_pid = eval_pid_flat2eval_sid_pid[k] 69 | eval_sid, eval_pid = divmod(eval_sid_pid, 100) 70 | scene_class = dspec.scene_class_from_sid(eval_sid) 71 | part_class_new2part_classes_old = {'UNLABELED': ['UNLABELED']} 72 | part_class_new2part_classes_old.update(part_groupings[scene_class]) 73 | part_class = list(part_class_new2part_classes_old.keys())[eval_pid] 74 | eval_pid_flat2scene_part_label[k] = f'{scene_class}-{part_class}' 75 | ################################################################################################### 76 | breakpoint() 77 | -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/utils/internal/ppq_ppp_20_58_part_groupings.yaml: -------------------------------------------------------------------------------- 1 | version: 2.0 2 | comments: 3 | - The IoU eval specification contains two optional fields. 4 | - scene_class_new2scene_classes_old contains a mapping from the new scene-level classes to at least one of the original scene-level classes of the dataset (dataset_spec.l). Use this mapping to group or ignore scene-level classes. 5 | - part_groupings is a mapping from the new part-level classes to at least one of the original part-level classes of the dataset. 6 | 7 | dataset_spec_path: panoptic_parts/specs/dataset_specs/ppp_datasetspec.yaml 8 | 9 | 10 | # part_groupings provides the information of the grouped part-level classes. 
11 | # typing: Dict(scene_class, Dict(part_class_new, part_classes_old)) 12 | # for now only a grouping of the part_classes under the same scene_class is supported 13 | part_groupings: { 14 | aeroplane: { 15 | # UNLABELED: IGNORED, 16 | body: [body], 17 | engine: [engine], 18 | wing: [lwing, rwing], 19 | stern: [stern, tail], 20 | wheel: [wheel], 21 | }, 22 | bicycle: { 23 | wheel: [fwheel, bwheel, chainwheel], 24 | body: [UNLABELED, saddle, handlebar, headlight], 25 | }, 26 | bird: { 27 | # UNLABELED: IGNORED, 28 | head: [head, leye, reye, beak], 29 | wing: [lwing, rwing], 30 | leg: [lleg, lfoot, rleg, rfoot], 31 | torso: [torso, neck, tail], 32 | }, 33 | boat: { 34 | boat: [UNLABELED], 35 | }, 36 | bottle: { 37 | # UNLABELED: IGNORED, 38 | cap: [cap], 39 | body: [body], 40 | }, 41 | bus: { 42 | # UNLABELED: IGNORED, 43 | window: [window], 44 | wheel: [wheel], 45 | body: [frontside, leftside, rightside, backside, roofside, leftmirror, rightmirror, fliplate, bliplate, door, headlight], 46 | }, 47 | car: { 48 | window: [window], 49 | wheel: [wheel], 50 | light: [headlight], 51 | license plate: [fliplate, bliplate], 52 | body: [frontside, leftside, rightside, backside, roofside, leftmirror, rightmirror, door], 53 | }, 54 | cat: { 55 | head: [head, leye, reye, lear, rear, nose], 56 | lower leg: [lfleg, lfpa, rfleg, rfpa, lbleg, lbpa, rbleg, rbpa], 57 | tail: [tail], 58 | torso: [torso, neck], 59 | }, 60 | chair: { 61 | chair: [UNLABELED], 62 | }, 63 | cow: { 64 | head: [head, leye, reye, lear, rear, muzzle, lhorn, rhorn], 65 | tail: [tail], 66 | lower leg: [lfuleg, lflleg, rfuleg, rflleg, lbuleg, lblleg, rbuleg, rblleg], 67 | torso: [torso, neck], 68 | }, 69 | table: { 70 | table: [UNLABELED], 71 | }, 72 | dog: { 73 | head: [head, leye, reye, lear, rear, nose, muzzle], 74 | lower leg: [lfleg, lfpa, rfleg, rfpa, lbleg, lbpa, rbleg, rbpa], 75 | tail: [tail], 76 | torso: [torso, neck], 77 | }, 78 | horse: { 79 | head: [head, leye, reye, lear, rear, muzzle], 80 | tail: [tail], 81 | leg: [lfho, rfho, lbho, rbho, lfuleg, lflleg, rfuleg, rflleg, lbuleg, lblleg, rbuleg, rblleg], 82 | torso: [torso, neck], 83 | }, 84 | motorbike: { 85 | wheel: [fwheel, bwheel], 86 | body: [UNLABELED, handlebar, saddle, headlight], 87 | }, 88 | person: { 89 | head: [head, leye, reye, lear, rear, lebrow, rebrow, nose, mouth, hair], 90 | torso: [neck, torso], 91 | lower arm: [llarm, lhand, rlarm, rhand], 92 | upper arm: [luarm, ruarm], 93 | lower leg: [llleg, lfoot, rlleg, rfoot], 94 | upper leg: [luleg, ruleg], 95 | }, 96 | pottedplant: { 97 | pot: [pot], 98 | plant: [plant], 99 | }, 100 | sheep: { 101 | head: [head, leye, reye, lear, rear, muzzle, lhorn, rhorn], 102 | leg: [lfuleg, lflleg, rfuleg, rflleg, lbuleg, lblleg, rbuleg, rblleg], 103 | torso: [torso, neck, tail], 104 | }, 105 | sofa: { 106 | sofa: [UNLABELED], 107 | }, 108 | train: { 109 | train: [head, hfrontside, hleftside, hrightside, hbackside, hroofside, headlight, coach, cfrontside, cleftside, crightside, cbackside, croofside], 110 | }, 111 | tvmonitor: { 112 | screen: [screen], 113 | frame: [frame], 114 | }, 115 | } 116 | -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/visualization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tue-mps/tapps/54d8d9f039fa91da6e046bfd8c62c13a9dc5e8b0/utils/panoptic_parts/panoptic_parts/visualization/__init__.py 
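For orientation, the sketch below shows how a mapping like `eval_sid_pid2eval_pid_flat` from the eval specs above can be applied to decoded labels: `sid_pid` values are encoded as `sid * 100 + pid`, the same convention behind the `divmod(sid_pid, 100)` calls in the populate script above. The dictionary subset and the `flatten_part_labels` helper are illustrative only, not part of the package API; evaluation in the package should go through the eval spec classes (e.g. `PartPQEvalSpec`) documented earlier.

```python
import numpy as np

# Illustrative subset of eval_sid_pid2eval_pid_flat (person parts, CPP default eval spec).
EVAL_SID_PID2EVAL_PID_FLAT = {2401: 1, 2402: 2, 2403: 3, 2404: 4}


def flatten_part_labels(sids_pids: np.ndarray, mapping: dict, unmapped: int = 0) -> np.ndarray:
    """Map sid_pid values (sid * 100 + pid) to flat part ids; unmapped values become `unmapped`."""
    flat = np.full_like(sids_pids, unmapped)
    for sid_pid, pid_flat in mapping.items():
        flat[sids_pids == sid_pid] = pid_flat
    return flat


# sids_pids as returned by decode_uids(..., return_sids_pids=True)
sids_pids = np.array([2401, 2404, 23, 2605])
print(flatten_part_labels(sids_pids, EVAL_SID_PID2EVAL_PID_FLAT))  # [1 4 0 0]
```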
-------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/visualization/visualize_label_with_legend.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run this script as 3 | `python -m panoptic_parts.visualization.visualize_label_with_legend \ 4 | ` 5 | to visualize a label in all three levels (semantic, instance, parts), 6 | together with a legend including all the colors and uids in that label. 7 | """ 8 | import argparse 9 | 10 | import numpy as np 11 | from PIL import Image 12 | import matplotlib.pyplot as plt 13 | 14 | from panoptic_parts.utils.visualization import experimental_colorize_label 15 | from panoptic_parts.utils.format import decode_uids, encode_ids 16 | from panoptic_parts.specs.dataset_spec import DatasetSpec 17 | 18 | 19 | def visualize_from_paths(datasetspec_path, label_path): 20 | """ 21 | Visualizes in a pyplot window a label from the provided path. 22 | 23 | For visualization pixels are colored on: 24 | - semantic-level: according to colors defined in dataspec.sid2scene_color 25 | - semantic-instance-level: with random shades of colors defined in dataspec.sid2scene_color 26 | - semantic-instance-parts-level: with a mixture of parula colormap and the shades above 27 | See panoptic_parts.utils.visualization.uid2color for more information on color generation. 28 | 29 | Args: 30 | datasetspec_path: a YAML file path, including keys: 31 | `sid2scene_color`, `scene_class_part_class_from_sid_pid` 32 | label_path: a label path, will be passed to Pillow.Image.open 33 | """ 34 | spec = DatasetSpec(datasetspec_path) 35 | uids = np.array(Image.open(label_path), dtype=np.int32) 36 | # for PPP, we need to fold groupable parts (see dataset ppp_datasetspec.yaml for more details) 37 | uids = encode_ids(*decode_uids(uids, experimental_dataset_spec=spec, experimental_correct_range=True)) 38 | 39 | uids_sem_inst_parts_colored, uid2color_dct = experimental_colorize_label( 40 | uids, sid2color=spec.sid2scene_color, emphasize_instance_boundaries=True, return_uid2color=True, 41 | experimental_deltas=(60, 60, 60), experimental_alpha=0.5) 42 | 43 | # plot 44 | _, ax1 = plt.subplots() 45 | 46 | # generate legend, h is a hidden rectangle just to create a legend entry 47 | handles = [] 48 | handles_text = [] 49 | uids_unique = np.unique(uids) 50 | for uid in uids_unique: 51 | h = plt.Rectangle((0, 0), 1, 1, fc=list(map(lambda x: x/255, uid2color_dct[uid]))) 52 | handles.append(h) 53 | _, _, _, sid_pid = decode_uids(uid, return_sids_pids=True) 54 | scene_class_part_class = spec.scene_class_part_class_from_sid_pid(sid_pid) 55 | handles_text.append(f'{uid}: {scene_class_part_class}') 56 | 57 | ax1.imshow(uids_sem_inst_parts_colored) 58 | ax1.set_title('labels colored on semantic, instance, and part levels', fontsize='small') 59 | ax1.legend(handles, handles_text, ncol=3, fontsize='small', handlelength=1.0, 60 | loc='center left', bbox_to_anchor=(1.01, 0.5)) 61 | plt.tight_layout() 62 | plt.show() 63 | 64 | 65 | def main(): 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument('datasetspec_path') 68 | parser.add_argument('label_path') 69 | args = parser.parse_args() 70 | visualize_from_paths(args.datasetspec_path, args.label_path) 71 | 72 | return 73 | 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /utils/panoptic_parts/pyproject.toml: 
-------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=51", 4 | "wheel", 5 | "numpy>=1.15" 6 | ] 7 | build-backend = "setuptools.build_meta" 8 | -------------------------------------------------------------------------------- /utils/panoptic_parts/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.15 2 | Pillow>=8.0 3 | scipy>=1.4 4 | ruamel.yaml>=0.15 5 | matplotlib>=3.3.0 -------------------------------------------------------------------------------- /utils/panoptic_parts/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = panoptic_parts 3 | version = 2.0rc5 4 | description = Panoptic Parts datasets 5 | long_description = file: README.md 6 | long_description_content_type = text/markdown 7 | author = Panoptic Parts team 8 | author_email = panoptic.parts@outlook.com 9 | maintainer = Panagiotis Meletis 10 | maintainer_email = panoptic.parts@outlook.com 11 | url = https://github.com/pmeletis/panoptic_parts 12 | project_urls = 13 | Documentation = https://panoptic-parts.readthedocs.io 14 | Bug Tracker = https://github.com/pmeletis/panoptic_parts/issues 15 | classifiers = 16 | Programming Language :: Python :: 3 17 | Programming Language :: Python :: 3.7 18 | Operating System :: OS Independent 19 | 20 | [options] 21 | packages = find: 22 | python_requires = >=3.7 23 | install_requires = 24 | numpy>=1.15 25 | Pillow>=8.0 26 | scipy>=1.4 27 | ruamel.yaml>=0.15 28 | matplotlib>=3.3.0 29 | include_package_data = True 30 | 31 | [options.extras_require] 32 | MERGING = 33 | tqdm 34 | pycocotools>=2.0.0 35 | 36 | [options.entry_points] 37 | console_scripts = 38 | pp_merge_to_pps = panoptic_parts.merging.merge_to_pps:main [MERGING] 39 | pp_merge_to_panoptic = panoptic_parts.merging.merge_to_panoptic:main [MERGING] 40 | pp_visualize_label_with_legend = panoptic_parts.visualization.visualize_label_with_legend:main 41 | -------------------------------------------------------------------------------- /utils/panoptic_parts/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tue-mps/tapps/54d8d9f039fa91da6e046bfd8c62c13a9dc5e8b0/utils/panoptic_parts/tests/__init__.py -------------------------------------------------------------------------------- /utils/panoptic_parts/tests/cityscapes_panoptic_parts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tue-mps/tapps/54d8d9f039fa91da6e046bfd8c62c13a9dc5e8b0/utils/panoptic_parts/tests/cityscapes_panoptic_parts/__init__.py -------------------------------------------------------------------------------- /utils/panoptic_parts/tests/cityscapes_panoptic_parts/dataset_sanity_check.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script reads the original labels of Cityscapes (CO) and compares them against 3 | the Cityscapes-Panoptic-Parts (CPP) labels. It verifies that the semantic and instance 4 | level labels of Cityscapes Panoptic Parts (CPP) are equivalent to 5 | original Cityscapes (CO), i.e., sids_iids_CPP == sids_iids_CO. 6 | """ 7 | import sys 8 | assert float(sys.version[:3]) >= 3.6, 'This test uses Python >= 3.6 functionality.' 
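# Caveat: slicing sys.version breaks for double-digit minor versions (e.g. Python 3.10 gives '3.1'),
# so this assert rejects newer interpreters; sys.version_info >= (3, 6) is the robust check.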
9 | import os.path as op 10 | import glob 11 | import multiprocessing 12 | 13 | import numpy as np 14 | from PIL import Image 15 | 16 | from panoptic_parts.utils.format import decode_uids 17 | 18 | # find all label paths 19 | BASEPATH_LABELS_ORIGINAL = 'tests/tests_files/cityscapes/gtFine' 20 | labels_paths_original = glob.glob(op.join(BASEPATH_LABELS_ORIGINAL, 'train', '*', '*_instanceIds.png')) 21 | labels_paths_original.extend(glob.glob(op.join(BASEPATH_LABELS_ORIGINAL, 'val', '*', '*_instanceIds.png'))) 22 | print(len(labels_paths_original)) 23 | labels_paths_ours = [ 24 | lp.replace('cityscapes/gtFine', 'cityscapes_panoptic_parts/gtFine_v2').replace('_instanceIds.png', 'PanopticParts.tif') 25 | for lp in labels_paths_original] 26 | print(len(labels_paths_ours)) 27 | 28 | def _sids_iids_are_maintained(inpts): 29 | lp_orig, lp_ours = inpts 30 | labels_orig = np.asarray(Image.open(lp_orig), dtype=np.int32) 31 | labels_ours = np.asarray(Image.open(lp_ours), dtype=np.int32) 32 | _, _, _, sids_iids = decode_uids(labels_ours, return_sids_iids=True) 33 | returns = np.all(np.equal(labels_orig, sids_iids)) 34 | # if not returns: 35 | # print(lp_orig, lp_ours, sep='\n') 36 | # print(np.unique(labels_orig), print(np.unique(sids_iids)), np.unique(labels_ours), sep='\n') 37 | return returns 38 | 39 | # validate labels 40 | with multiprocessing.Pool(multiprocessing.cpu_count()) as pool: 41 | maintained_bools =[mb for mb in pool.imap_unordered( 42 | _sids_iids_are_maintained, zip(labels_paths_original, labels_paths_ours), chunksize=10)] 43 | 44 | print(len(maintained_bools), 'files were verified.') 45 | assert all(maintained_bools), 'some sids_iids are not the same' 46 | -------------------------------------------------------------------------------- /utils/panoptic_parts/tests/cityscapes_panoptic_parts/visualize_from_paths_test.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # Change the paths below for your system and run this script from top-level dir as: 3 | # bash tests/cityscapes_panoptic_parts/visualize_from_paths_test.sh 4 | 5 | python -m panoptic_parts.cityscapes_panoptic_parts.visualize_from_paths \ 6 | tests/tests_files/cityscapes_panoptic_parts/leftImg8bit/train/aachen/aachen_000012_000019_leftImg8bit.png \ 7 | tests/tests_files/cityscapes_panoptic_parts/gtFine_v2/train/aachen/aachen_000012_000019_gtFinePanopticParts.tif \ 8 | panoptic_parts/utils/defs/cpp_20.yaml 9 | -------------------------------------------------------------------------------- /utils/panoptic_parts/tests/cityscapes_panoptic_parts/visualize_label_with_legend_test.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # Change the paths below for your system and run this script from top-level dir as: 3 | # bash tests/cityscapes_panoptic_parts/visualize_label_with_legend_test.sh 4 | 5 | python -m panoptic_parts.cityscapes_panoptic_parts.visualize_label_with_legend \ 6 | tests/tests_files/gtFinePanopticParts/val/munster/munster_000080_000019_gtFinePanopticParts.tif \ 7 | panoptic_parts/utils/defs/cpp_20.yaml 8 | -------------------------------------------------------------------------------- /utils/panoptic_parts/tests/pascal_panoptic_parts/visualize_from_paths_test.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/sh 2 | # Change the paths below for your system and run this script from top-level dir as: 3 | # bash tests/pascal_panoptic_parts/visualize_from_paths_test.sh 4 | 5 | python -m panoptic_parts.pascal_panoptic_parts.visualize_from_paths \ 6 | tests/tests_files/pascal_panoptic_parts/images/2010_002877.jpg \ 7 | tests/tests_files/pascal_panoptic_parts/labels/2010_002877.tif \ 8 | panoptic_parts/utils/defs/ppp_100.yaml 9 | -------------------------------------------------------------------------------- /utils/panoptic_parts/tests/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tue-mps/tapps/54d8d9f039fa91da6e046bfd8c62c13a9dc5e8b0/utils/panoptic_parts/tests/utils/__init__.py -------------------------------------------------------------------------------- /utils/panoptic_parts/tests/utils/utils_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import tifffile 4 | from panoptic_parts.utils.utils import safe_write 5 | 6 | pth0 = "test.png" 7 | pth1 = "test1.png" 8 | pth2 = "test2.png" 9 | pth3 = "test3.png" 10 | pth4 = "test4.png" 11 | 12 | im = np.random.randint(0, high=255, size=(600, 800, 3), dtype=np.uint8) 13 | 14 | # all following commands should have the same output file size 15 | safe_write(pth0, im) 16 | safe_write(pth1, im, optimize=True) 17 | safe_write(pth2, im, compression_level=9) 18 | tifffile.imwrite(pth3, im) 19 | tifffile.imwrite(pth4, im, compression = 'zlib') 20 | -------------------------------------------------------------------------------- /utils/panoptic_parts/tests/utils/visualization_test.py: -------------------------------------------------------------------------------- 1 | import matplotlib as mpl 2 | import matplotlib.pyplot as plt 3 | 4 | from panoptic_parts.utils.visualization import random_colors, PARULA99_CM 5 | 6 | def random_colors_test(): 7 | colors0 = random_colors(0) 8 | assert len(colors0) == 0 9 | print(colors0) 10 | colors1 = random_colors(1) 11 | assert len(colors1) == 1 12 | print(colors1) 13 | colors10 = random_colors(10) 14 | assert len(colors10) == 10 15 | assert all(isinstance(color, tuple) for color in colors10) 16 | print(colors10) 17 | 18 | 19 | def parula99_cm_test(): 20 | # just a demo function plotting the colormap 21 | fig, ax = plt.subplots(figsize=(6, 1)) 22 | fig.subplots_adjust(bottom=0.5) 23 | norm = mpl.colors.Normalize(vmin=1, vmax=PARULA99_CM.N + 1) 24 | fig.colorbar(mpl.cm.ScalarMappable(norm=norm, cmap=PARULA99_CM), 25 | cax=ax, orientation='horizontal', label='part-level semantic classes') 26 | fig.waitforbuttonpress(30.0) 27 | Nparts = 5 28 | bounds = list(range(1, Nparts + 1)) 29 | norm = mpl.colors.BoundaryNorm(bounds, PARULA99_CM.N, extend='both') 30 | print(*map(norm, range(Nparts + 1 + 1))) 31 | mpl.colorbar.ColorbarBase(ax, cmap=PARULA99_CM, norm=norm, orientation='horizontal') 32 | plt.draw() 33 | fig.waitforbuttonpress(30.0) 34 | 35 | 36 | if __name__ == "__main__": 37 | random_colors_test() 38 | parula99_cm_test() 39 | --------------------------------------------------------------------------------