├── .gitignore ├── INSTALL.md ├── LICENSE ├── LICENSE_MASK2FORMER ├── MODELS.md ├── README.md ├── checkpoints └── README.md ├── configs ├── ade20k │ ├── instance-segmentation │ │ ├── Base-ADE20K-InstanceSegmentation.yaml │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ └── swin │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_160k.yaml │ ├── panoptic-segmentation │ │ ├── Base-ADE20K-PanopticSegmentation.yaml │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ └── swin │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_160k.yaml │ └── semantic-segmentation │ │ ├── Base-ADE20K-SemanticSegmentation.yaml │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ └── swin │ │ ├── maskformer2_swin_base_384_bs16_160k_res640.yaml │ │ ├── maskformer2_swin_base_IN21k_384_bs16_160k_res640.yaml │ │ ├── maskformer2_swin_large_IN21k_384_bs16_160k_res640.yaml │ │ ├── maskformer2_swin_small_bs16_160k.yaml │ │ └── maskformer2_swin_tiny_bs16_160k.yaml ├── cityscapes │ ├── instance-segmentation │ │ ├── Base-Cityscapes-InstanceSegmentation.yaml │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ └── swin │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ └── maskformer2_swin_tiny_bs16_90k.yaml │ ├── panoptic-segmentation │ │ ├── Base-Cityscapes-PanopticSegmentation.yaml │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ └── swin │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ └── maskformer2_swin_tiny_bs16_90k.yaml │ ├── pps │ │ ├── Base-Cityscapes-PPS.yaml │ │ ├── tapps_cityscapes_r50_cocoinit.yaml │ │ ├── tapps_cityscapes_r50_in1kinit.yaml │ │ └── tapps_cityscapes_swinb_cocoinit.yaml │ └── semantic-segmentation │ │ ├── Base-Cityscapes-SemanticSegmentation.yaml │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ └── swin │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ └── maskformer2_swin_tiny_bs16_90k.yaml ├── coco │ ├── instance-segmentation │ │ ├── Base-COCO-InstanceSegmentation.yaml │ │ ├── maskformer2_R101_bs16_50ep.yaml │ │ ├── maskformer2_R50_bs16_50ep.yaml │ │ └── swin │ │ │ ├── maskformer2_swin_base_384_bs16_50ep.yaml │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_50ep.yaml │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_100ep.yaml │ │ │ ├── maskformer2_swin_small_bs16_50ep.yaml │ │ │ └── maskformer2_swin_tiny_bs16_50ep.yaml │ └── panoptic-segmentation │ │ ├── Base-COCO-PanopticSegmentation.yaml │ │ ├── maskformer2_R101_bs16_50ep.yaml │ │ ├── maskformer2_R50_bs16_50ep.yaml │ │ └── swin │ │ ├── maskformer2_swin_base_384_bs16_50ep.yaml │ │ ├── maskformer2_swin_base_IN21k_384_bs16_50ep.yaml │ │ ├── maskformer2_swin_large_IN21k_384_bs16_100ep.yaml │ │ ├── maskformer2_swin_small_bs16_50ep.yaml │ │ └── maskformer2_swin_tiny_bs16_50ep.yaml ├── mapillary-vistas │ ├── panoptic-segmentation │ │ ├── Base-MapillaryVistas-PanopticSegmentation.yaml │ │ ├── maskformer_R50_bs16_300k.yaml │ │ └── swin │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_300k.yaml │ └── semantic-segmentation │ │ ├── Base-MapillaryVistas-SemanticSegmentation.yaml │ │ ├── maskformer2_R50_bs16_300k.yaml │ │ └── swin │ │ └── maskformer2_swin_large_IN21k_384_bs16_300k.yaml └── pascal │ └── pps │ ├── 
Base-Pascal-PPS-LSJ.yaml │ ├── pascal_107 │ ├── tapps_pascal107_r50_cocoinit.yaml │ └── tapps_pascal107_r50_in1kinit.yaml │ ├── tapps_pascal_r50_cocoinit.yaml │ ├── tapps_pascal_r50_in1kinit.yaml │ └── tapps_pascal_swinb_cocoinit.yaml ├── datasets ├── README.md ├── ade20k_instance_catid_mapping.txt ├── ade20k_instance_imgCatIds.json ├── prepare_ade20k_ins_seg.py ├── prepare_ade20k_pan_seg.py ├── prepare_ade20k_sem_seg.py ├── prepare_cityscapes_pp.py ├── prepare_coco_semantic_annos_from_panoptic_annos.py ├── prepare_pascal_pp.py └── prepare_pascal_pp_107.py ├── eval ├── eval_partpq.py └── visualize_pps.py ├── inference_single_img.py ├── requirements.txt ├── tapps ├── __init__.py ├── config.py ├── data │ ├── __init__.py │ ├── dataset_mappers │ │ ├── __init__.py │ │ ├── augmentations.py │ │ ├── coco_instance_new_baseline_dataset_mapper.py │ │ ├── coco_panoptic_new_baseline_dataset_mapper.py │ │ ├── mask_former_instance_dataset_mapper.py │ │ ├── mask_former_panoptic_dataset_mapper.py │ │ ├── mask_former_panoptic_parts_dataset_mapper.py │ │ ├── mask_former_semantic_dataset_mapper.py │ │ └── pascal_panoptic_parts_new_baseline_dataset_mapper.py │ └── datasets │ │ ├── __init__.py │ │ ├── register_ade20k_full.py │ │ ├── register_ade20k_instance.py │ │ ├── register_ade20k_panoptic.py │ │ ├── register_cityscapes_panoptic_parts.py │ │ ├── register_coco_panoptic_annos_semseg.py │ │ ├── register_coco_stuff_10k.py │ │ ├── register_mapillary_vistas.py │ │ ├── register_mapillary_vistas_panoptic.py │ │ ├── register_pascal_panoptic_parts.py │ │ └── register_pascal_panoptic_parts_107.py ├── evaluation │ ├── __init__.py │ └── instance_evaluation.py ├── maskformer_model.py ├── modeling │ ├── __init__.py │ ├── backbone │ │ ├── __init__.py │ │ └── swin.py │ ├── criterion.py │ ├── matcher.py │ ├── meta_arch │ │ ├── __init__.py │ │ ├── mask_former_head.py │ │ └── per_pixel_baseline.py │ ├── pixel_decoder │ │ ├── __init__.py │ │ ├── fpn.py │ │ ├── msdeformattn.py │ │ └── ops │ │ │ ├── functions │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn_func.py │ │ │ ├── make.sh │ │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn.py │ │ │ ├── setup.py │ │ │ ├── src │ │ │ ├── cpu │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ └── ms_deform_attn_cpu.h │ │ │ ├── cuda │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ ├── ms_deform_attn.h │ │ │ └── vision.cpp │ │ │ └── test.py │ └── transformer_decoder │ │ ├── __init__.py │ │ ├── mask2former_transformer_decoder.py │ │ ├── maskformer_transformer_decoder.py │ │ ├── part_decoder.py │ │ ├── position_encoding.py │ │ └── transformer.py ├── test_time_augmentation.py └── utils │ ├── __init__.py │ └── misc.py ├── tools ├── README.md ├── analyze_model.py ├── convert-pretrained-swin-model-to-d2.py ├── convert-torchvision-to-d2.py ├── evaluate_coco_boundary_ap.py └── evaluate_pq_for_semantic_segmentation.py ├── train_net.py └── utils └── panoptic_parts ├── LICENSE ├── MANIFEST.in ├── README.md ├── __init__.py ├── docs ├── Makefile ├── make.bat ├── requirements.txt └── source │ ├── api_and_code.rst │ ├── conf.py │ ├── contact.md │ ├── errata_cvpr2021.md │ ├── evaluate_results.md │ ├── generate_results.md │ ├── ground_truth_usage_cases.md │ ├── index.rst │ ├── installation.md │ ├── introduction.md │ ├── label_format.md │ ├── scripts.md │ ├── tools.md │ └── visualization.md ├── optional.txt ├── panoptic_parts ├── __init__.py ├── cityscapes_panoptic_parts │ ├── __init__.py │ └── dataset_v2.0 │ │ └── README.md ├── evaluation │ 
├── __init__.py │ ├── eval_PartPQ.py │ ├── experimental_eval_PartIOU.py │ └── prepare_data.py ├── merging │ ├── __init__.py │ ├── merge_to_panoptic.py │ └── merge_to_pps.py ├── pascal_panoptic_parts │ ├── __init__.py │ └── dataset_v2.0 │ │ └── README.md ├── specs │ ├── __init__.py │ ├── dataset_spec.py │ ├── dataset_specs │ │ ├── cpp_datasetspec.yaml │ │ └── ppp_datasetspec.yaml │ ├── eval_spec.py │ └── eval_specs │ │ ├── ppq_cpp_19_23_cvpr21_default_evalspec.yaml │ │ ├── ppq_cpp_19_23_cvpr21_grouped_evalspec.yaml │ │ ├── ppq_ppp_59_107_cvpr21_default_evalspec.yaml │ │ └── ppq_ppp_59_57_cvpr21_default_evalspec.yaml ├── utils │ ├── __init__.py │ ├── evaluation_PartPQ.py │ ├── experimental_evaluation_IOU.py │ ├── format.py │ ├── internal │ │ ├── __init__.py │ │ ├── convert_annotations_v1_to_v2.py │ │ ├── populate_ppp_official_evalspec.py │ │ └── ppq_ppp_20_58_part_groupings.yaml │ ├── utils.py │ └── visualization.py └── visualization │ ├── __init__.py │ └── visualize_label_with_legend.py ├── pyproject.toml ├── requirements.txt ├── setup.cfg └── tests ├── __init__.py ├── cityscapes_panoptic_parts ├── __init__.py ├── dataset_sanity_check.py ├── visualize_from_paths_test.sh └── visualize_label_with_legend_test.sh ├── pascal_panoptic_parts └── visualize_from_paths_test.sh └── utils ├── __init__.py ├── format_test.py ├── utils_test.py └── visualization_test.py /.gitignore: -------------------------------------------------------------------------------- 1 | # output dir 2 | output 3 | predictions 4 | instant_test_output 5 | inference_test_output 6 | 7 | 8 | *.png 9 | *.json 10 | *.diff 11 | *.jpg 12 | !/projects/DensePose/doc/images/*.jpg 13 | 14 | # compilation and distribution 15 | __pycache__ 16 | _ext 17 | *.pyc 18 | *.pyd 19 | *.so 20 | *.dll 21 | *.egg-info/ 22 | build/ 23 | dist/ 24 | wheels/ 25 | 26 | # pytorch/python/numpy formats 27 | *.pth 28 | *.pkl 29 | *.npy 30 | *.ts 31 | model_ts*.txt 32 | 33 | # ipython/jupyter notebooks 34 | *.ipynb 35 | **/.ipynb_checkpoints/ 36 | 37 | # Editor temporaries 38 | *.swn 39 | *.swo 40 | *.swp 41 | *~ 42 | 43 | # editor settings 44 | .idea 45 | .vscode 46 | _darcs 47 | 48 | # project dirs 49 | /checkpoints/* 50 | /detectron2/model_zoo/configs 51 | /datasets/* 52 | !/datasets/*.* 53 | /projects/*/datasets 54 | /models 55 | /snippet -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | These installation instructions have been slightly adapted from the original [Mask2Former instructions](https://github.com/facebookresearch/Mask2Former/blob/main/INSTALL.md). 4 | 5 | ### Requirements 6 | - Linux or macOS with Python ≥ 3.6 7 | - PyTorch ≥ 1.9 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation. 8 | Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note: please check that the 9 | PyTorch version matches the one required by Detectron2. 10 | - Detectron2: follow the [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html). 11 | - OpenCV is optional but needed for the demo and visualization 12 | - `pip install -r requirements.txt` 13 | 14 | ### CUDA kernel for MSDeformAttn 15 | After preparing the required environment, run the following command to compile the CUDA kernel for MSDeformAttn: 16 | 17 | `CUDA_HOME` must be defined and point to the directory of the installed CUDA toolkit. 
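If `CUDA_HOME` is not already set in your shell, export it before running the compile command below; the path here is only an example and depends on where the CUDA toolkit is installed on your system:

```bash
# Example only: adjust the path to your local CUDA toolkit installation.
export CUDA_HOME=/usr/local/cuda
# Optional sanity check: nvcc should be found inside $CUDA_HOME.
$CUDA_HOME/bin/nvcc --version
```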
18 | 19 | ```bash 20 | cd tapps/modeling/pixel_decoder/ops 21 | sh make.sh 22 | ``` 23 | 24 | #### Building on another system 25 | To build on a system that does not have a GPU device but does provide the drivers: 26 | ```bash 27 | TORCH_CUDA_ARCH_LIST='8.0' FORCE_CUDA=1 python setup.py build install 28 | ``` 29 | 30 | ### Example conda environment setup 31 | ```bash 32 | conda create --name tapps python=3.9 -y 33 | conda activate tapps 34 | conda install pytorch==1.13.1 torchvision==0.14.1 pytorch-cuda=11.7 -c pytorch -c nvidia 35 | conda install ruamel.yaml pandas scipy shapely h5py 36 | pip install -U opencv-python 37 | 38 | # under your working directory 39 | git clone git@github.com:facebookresearch/detectron2.git 40 | cd detectron2 41 | python -m pip install -e . 42 | pip install git+https://github.com/cocodataset/panopticapi.git 43 | pip install git+https://github.com/mcordts/cityscapesScripts.git 44 | 45 | cd .. 46 | git clone https://github.com/tue-mps/tapps.git 47 | cd tapps 48 | pip install timm submitit cython scikit-image psutil scikit-learn 49 | cd tapps/modeling/pixel_decoder/ops 50 | sh make.sh 51 | ``` 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024 Daan de Geus, Eindhoven University of Technology 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /LICENSE_MASK2FORMER: -------------------------------------------------------------------------------- 1 | Copyright (c) 2022 Meta, Inc. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /checkpoints/README.md: -------------------------------------------------------------------------------- 1 | # Download pre-trained model weights 2 | 3 | ## TAPPS model weights 4 | [Here](../MODELS.md), we list the different models that we release, and provide a download link. 5 | 6 | 7 | ## COCO pre-trained model weights 8 | To initialize a model with COCO (panoptic) pre-trained weights, as done in our work, follow these steps: 9 | 10 | 1. Identify the backbone architecture of the model you wish to train (e.g., ResNet-50 or Swin-B) 11 | 2. For this backbone, download the model weights provided in the [original Mask2Former repository](https://github.com/facebookresearch/Mask2Former/blob/main/MODEL_ZOO.md#panoptic-segmentation), trained for COCO panoptic segmentation. 12 | 3. Place these model weights in the `checkpoints` directory, following this structure: 13 | 14 | ``` 15 | checkpoints/ 16 | maskformer2_R50_bs16_50ep/ 17 | model_final_94dc52.pkl 18 | maskformer2_swin_base_IN21k_384_bs16_50ep/ 19 | model_final_54b88a.pkl 20 | ``` 21 | 22 | Then, you can simply run the training code following [the instructions provided here](../README.md#training). In the default configs, the path to the COCO pre-trained weights is already provided. -------------------------------------------------------------------------------- /configs/ade20k/instance-segmentation/Base-ADE20K-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_instance_train",) 18 | TEST: ("ade20k_instance_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 640 40 | MAX_SIZE_TRAIN: 2560 41 | MAX_SIZE_TEST: 2560 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (640, 640) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_instance" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 56 | MAX_SIZE: 4480 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | 
-------------------------------------------------------------------------------- /configs/ade20k/instance-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 100 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/ade20k/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /configs/ade20k/panoptic-segmentation/Base-ADE20K-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_panoptic_train",) 18 | TEST: ("ade20k_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 
38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 640 40 | MAX_SIZE_TRAIN: 2560 41 | MAX_SIZE_TEST: 2560 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (640, 640) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 56 | MAX_SIZE: 4480 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /configs/ade20k/panoptic-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/ade20k/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/Base-ADE20K-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | 
RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_sem_seg_train",) 18 | TEST: ("ade20k_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 512 40 | MAX_SIZE_TRAIN: 2048 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 512) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 512 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [256, 384, 512, 640, 768, 896] 56 | MAX_SIZE: 3584 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 
1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | 
FLIP: True 38 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/swin/maskformer2_swin_small_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/Base-Cityscapes-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_instance_seg_train",) 18 | TEST: ("cityscapes_fine_instance_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_instance" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: 
"R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 8 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: False 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: 
"D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/Base-Cityscapes-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_panoptic_train",) 18 | TEST: ("cityscapes_fine_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- 
/configs/cityscapes/panoptic-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | 
-------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/cityscapes/pps/Base-Cityscapes-PPS.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_panoptic_parts_train",) 18 | TEST: ("cityscapes_panoptic_parts_val",) 19 | NAME: "Cityscapes" 20 | SOLVER: 21 | IMS_PER_BATCH: 16 22 | BASE_LR: 0.0001 23 | MAX_ITER: 90000 24 | CHECKPOINT_PERIOD: 20000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 0 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | LR_SCHEDULER_NAME: "WarmupPolyLR" 30 | BACKBONE_MULTIPLIER: 0.1 31 | CLIP_GRADIENTS: 32 | ENABLED: True 33 | CLIP_TYPE: "full_model" 34 | CLIP_VALUE: 0.01 35 | NORM_TYPE: 2.0 36 | AMP: 37 | ENABLED: True 38 | INPUT: 39 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 40 | MIN_SIZE_TRAIN_SAMPLING: "choice" 41 | MIN_SIZE_TEST: 1024 42 | MAX_SIZE_TRAIN: 4096 43 | MAX_SIZE_TEST: 2048 44 | CROP: 45 | ENABLED: True 46 | TYPE: "absolute" 47 | SIZE: (512, 1024) 48 | SINGLE_CATEGORY_MAX_AREA: 1.0 49 | COLOR_AUG_SSD: True 50 | SIZE_DIVISIBILITY: -1 51 | FORMAT: "RGB" 52 | DATASET_MAPPER_NAME: "mask_former_panoptic_parts" 53 | TEST: 54 | EVAL_PERIOD: 20000 55 | AUG: 56 | ENABLED: False 57 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 58 | MAX_SIZE: 4096 59 | FLIP: True 60 | DATALOADER: 61 | FILTER_EMPTY_ANNOTATIONS: True 62 | NUM_WORKERS: 4 63 | VERSION: 2 64 | PPS_EVAL_SPEC: "utils/panoptic_parts/panoptic_parts/specs/eval_specs/ppq_cpp_19_23_cvpr21_grouped_evalspec.yaml" -------------------------------------------------------------------------------- /configs/cityscapes/pps/tapps_cityscapes_r50_cocoinit.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-PPS.yaml 2 | OUTPUT_DIR: "output/tapps_cityscapes_r50_cocoinit/" 3 | MODEL: 4 | WEIGHTS: "checkpoints/maskformer2_R50_bs16_50ep/model_final_94dc52.pkl" 5 | META_ARCHITECTURE: "MaskFormer" 6 | SEM_SEG_HEAD: 7 | NAME: "MaskFormerHead" 8 | IGNORE_VALUE: 255 9 | NUM_CLASSES: 19 10 | LOSS_WEIGHT: 1.0 11 | CONVS_DIM: 256 12 | MASK_DIM: 256 13 | NORM: "GN" 14 | # pixel decoder 15 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 16 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 17 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 18 | COMMON_STRIDE: 4 19 | 
TRANSFORMER_ENC_LAYERS: 6 20 | RESNETS: 21 | NORM: "SyncBN" 22 | MASK_FORMER: 23 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 24 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 25 | DEEP_SUPERVISION: True 26 | NO_OBJECT_WEIGHT: 0.1 27 | CLASS_WEIGHT: 2.0 28 | MASK_WEIGHT: 5.0 29 | DICE_WEIGHT: 5.0 30 | HIDDEN_DIM: 256 31 | NUM_OBJECT_QUERIES: 100 32 | NHEADS: 8 33 | DROPOUT: 0.0 34 | DIM_FEEDFORWARD: 2048 35 | ENC_LAYERS: 0 36 | PRE_NORM: False 37 | ENFORCE_INPUT_PROJ: False 38 | SIZE_DIVISIBILITY: 32 39 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 40 | TRAIN_NUM_POINTS: 12544 41 | OVERSAMPLE_RATIO: 3.0 42 | IMPORTANCE_SAMPLE_RATIO: 0.75 43 | PARTS_ON: True 44 | NUM_PART_CLASSES: 9 45 | LOSS_WEIGHT_PARTS: 1.0 46 | LOSS_WEIGHT_PANOPTIC: 1.0 47 | PARTS_CONF_THRESHOLD: 0.1 48 | TEST: 49 | SEMANTIC_ON: False 50 | INSTANCE_ON: False 51 | PANOPTIC_ON: True 52 | PARTS_ON: True 53 | OVERLAP_THRESHOLD: 0.8 54 | OBJECT_MASK_THRESHOLD: 0.8 55 | -------------------------------------------------------------------------------- /configs/cityscapes/pps/tapps_cityscapes_r50_in1kinit.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-PPS.yaml 2 | OUTPUT_DIR: "output/tapps_cityscapes_r50_in1kinit/" 3 | MODEL: 4 | META_ARCHITECTURE: "MaskFormer" 5 | SEM_SEG_HEAD: 6 | NAME: "MaskFormerHead" 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 19 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | RESNETS: 20 | NORM: "SyncBN" 21 | MASK_FORMER: 22 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 23 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 24 | DEEP_SUPERVISION: True 25 | NO_OBJECT_WEIGHT: 0.1 26 | CLASS_WEIGHT: 2.0 27 | MASK_WEIGHT: 5.0 28 | DICE_WEIGHT: 5.0 29 | HIDDEN_DIM: 256 30 | NUM_OBJECT_QUERIES: 100 31 | NHEADS: 8 32 | DROPOUT: 0.0 33 | DIM_FEEDFORWARD: 2048 34 | ENC_LAYERS: 0 35 | PRE_NORM: False 36 | ENFORCE_INPUT_PROJ: False 37 | SIZE_DIVISIBILITY: 32 38 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 39 | TRAIN_NUM_POINTS: 12544 40 | OVERSAMPLE_RATIO: 3.0 41 | IMPORTANCE_SAMPLE_RATIO: 0.75 42 | PARTS_ON: True 43 | NUM_PART_CLASSES: 9 44 | LOSS_WEIGHT_PARTS: 1.0 45 | LOSS_WEIGHT_PANOPTIC: 1.0 46 | PARTS_CONF_THRESHOLD: 0.1 47 | TEST: 48 | SEMANTIC_ON: False 49 | INSTANCE_ON: False 50 | PANOPTIC_ON: True 51 | PARTS_ON: True 52 | OVERLAP_THRESHOLD: 0.8 53 | OBJECT_MASK_THRESHOLD: 0.8 -------------------------------------------------------------------------------- /configs/cityscapes/pps/tapps_cityscapes_swinb_cocoinit.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-PPS.yaml 2 | OUTPUT_DIR: "output/tapps_cityscapes_swinb_cocoinit/" 3 | MODEL: 4 | WEIGHTS: "checkpoints/maskformer2_swin_base_IN21k_384_bs16_50ep/model_final_54b88a.pkl" 5 | META_ARCHITECTURE: "MaskFormer" 6 | BACKBONE: 7 | NAME: "D2SwinTransformer" 8 | SWIN: 9 | EMBED_DIM: 128 10 | DEPTHS: [ 2, 2, 18, 2 ] 11 | NUM_HEADS: [ 4, 8, 16, 32 ] 12 | WINDOW_SIZE: 12 13 | APE: False 14 | DROP_PATH_RATE: 0.3 15 | PATCH_NORM: True 16 | PRETRAIN_IMG_SIZE: 384 17 | PIXEL_MEAN: [ 123.675, 116.280, 103.530 ] 18 | PIXEL_STD: [ 58.395, 57.120, 57.375 ] 19 
| SEM_SEG_HEAD: 20 | NAME: "MaskFormerHead" 21 | IGNORE_VALUE: 255 22 | NUM_CLASSES: 19 23 | LOSS_WEIGHT: 1.0 24 | CONVS_DIM: 256 25 | MASK_DIM: 256 26 | NORM: "GN" 27 | # pixel decoder 28 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 29 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 30 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 31 | COMMON_STRIDE: 4 32 | TRANSFORMER_ENC_LAYERS: 6 33 | MASK_FORMER: 34 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 35 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 36 | DEEP_SUPERVISION: True 37 | NO_OBJECT_WEIGHT: 0.1 38 | CLASS_WEIGHT: 2.0 39 | MASK_WEIGHT: 5.0 40 | DICE_WEIGHT: 5.0 41 | HIDDEN_DIM: 256 42 | NUM_OBJECT_QUERIES: 100 43 | NHEADS: 8 44 | DROPOUT: 0.0 45 | DIM_FEEDFORWARD: 2048 46 | ENC_LAYERS: 0 47 | PRE_NORM: False 48 | ENFORCE_INPUT_PROJ: False 49 | SIZE_DIVISIBILITY: 32 50 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 51 | TRAIN_NUM_POINTS: 12544 52 | OVERSAMPLE_RATIO: 3.0 53 | IMPORTANCE_SAMPLE_RATIO: 0.75 54 | PARTS_ON: True 55 | NUM_PART_CLASSES: 9 56 | LOSS_WEIGHT_PARTS: 1.0 57 | LOSS_WEIGHT_PANOPTIC: 1.0 58 | PARTS_CONF_THRESHOLD: 0.1 59 | TEST: 60 | SEMANTIC_ON: False 61 | INSTANCE_ON: False 62 | PANOPTIC_ON: True 63 | PARTS_ON: True 64 | OVERLAP_THRESHOLD: 0.8 65 | OBJECT_MASK_THRESHOLD: 0.8 66 | 67 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/Base-Cityscapes-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_sem_seg_train",) 18 | TEST: ("cityscapes_fine_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: 
maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: 
../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/Base-COCO-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train",) 18 | TEST: ("coco_2017_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_instance_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/maskformer2_R101_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 80 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: 
"MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: False 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | SOLVER: 20 | STEPS: (655556, 710184) 21 | MAX_ITER: 737500 22 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | 
DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/swin/maskformer2_swin_tiny_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/Base-COCO-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train_panoptic",) 18 | TEST: ("coco_2017_val_panoptic",) # to evaluate instance and semantic performance as well 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_panoptic_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/maskformer2_R101_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 133 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: 
"MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: True 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: True 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | SOLVER: 20 | STEPS: (655556, 710184) 21 | MAX_ITER: 737500 22 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | 
DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/swin/maskformer2_swin_tiny_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/mapillary-vistas/panoptic-segmentation/Base-MapillaryVistas-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_vistas_panoptic_train",) 18 | TEST: ("mapillary_vistas_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 300000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 2048 40 | MAX_SIZE_TRAIN: 8192 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (1024, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 1024 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 0 53 | DATALOADER: 54 | FILTER_EMPTY_ANNOTATIONS: True 55 | NUM_WORKERS: 10 56 | VERSION: 2 57 | -------------------------------------------------------------------------------- /configs/mapillary-vistas/panoptic-segmentation/maskformer_R50_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-MapillaryVistas-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 65 7 | NUM_CLASSES: 65 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: 
"MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.0 45 | -------------------------------------------------------------------------------- /configs/mapillary-vistas/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /configs/mapillary-vistas/semantic-segmentation/Base-MapillaryVistas-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_vistas_sem_seg_train",) 18 | TEST: ("mapillary_vistas_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 300000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 2048 40 | MAX_SIZE_TRAIN: 8192 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (1024, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 1024 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 0 53 | DATALOADER: 54 | FILTER_EMPTY_ANNOTATIONS: True 55 | NUM_WORKERS: 10 56 | VERSION: 2 57 | -------------------------------------------------------------------------------- /configs/mapillary-vistas/semantic-segmentation/maskformer2_R50_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | 
_BASE_: Base-MapillaryVistas-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 65 7 | NUM_CLASSES: 65 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.0 45 | -------------------------------------------------------------------------------- /configs/mapillary-vistas/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | -------------------------------------------------------------------------------- /configs/pascal/pps/Base-Pascal-PPS-LSJ.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("pascal_panoptic_parts_train",) 18 | TEST: ("pascal_panoptic_parts_val",) 19 | NAME: "Pascal" 20 | SOLVER: 21 | IMS_PER_BATCH: 16 22 | BASE_LR: 0.0001 23 | MAX_ITER: 60000 24 | CHECKPOINT_PERIOD: 10000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 100 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | LR_SCHEDULER_NAME: "WarmupPolyLR" 30 | BACKBONE_MULTIPLIER: 0.1 31 | CLIP_GRADIENTS: 32 | ENABLED: True 33 | CLIP_TYPE: "full_model" 34 | CLIP_VALUE: 0.01 35 | NORM_TYPE: 2.0 36 | AMP: 37 | ENABLED: True 38 | INPUT: 39 | IMAGE_SIZE: 1024 40 | MIN_SCALE: 0.1 41 | MAX_SCALE: 2.0 42 | FORMAT: "RGB" 43 | DATASET_MAPPER_NAME: "pascal_panoptic_parts_lsj" 44 | TEST: 45 | EVAL_PERIOD: 20000 46 | DATALOADER: 47 | FILTER_EMPTY_ANNOTATIONS: True 48 | NUM_WORKERS: 4 
49 | VERSION: 2 50 | PPS_EVAL_SPEC: "utils/panoptic_parts/panoptic_parts/specs/eval_specs/ppq_ppp_59_57_cvpr21_default_evalspec.yaml" -------------------------------------------------------------------------------- /configs/pascal/pps/pascal_107/tapps_pascal107_r50_cocoinit.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../Base-Pascal-PPS-LSJ.yaml 2 | OUTPUT_DIR: "output/tapps_pascal107_r50_cocoinit/" 3 | DATASETS: 4 | TRAIN: ("pascal_panoptic_parts_107_train",) 5 | TEST: ("pascal_panoptic_parts_107_val",) 6 | NAME: "Pascal107" 7 | SOLVER: 8 | MAX_ITER: 10000 9 | MODEL: 10 | WEIGHTS: "checkpoints/maskformer2_R50_bs16_50ep/model_final_94dc52.pkl" 11 | META_ARCHITECTURE: "MaskFormer" 12 | SEM_SEG_HEAD: 13 | NAME: "MaskFormerHead" 14 | IGNORE_VALUE: 255 15 | NUM_CLASSES: 59 16 | LOSS_WEIGHT: 1.0 17 | CONVS_DIM: 256 18 | MASK_DIM: 256 19 | NORM: "GN" 20 | # pixel decoder 21 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 22 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 23 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 24 | COMMON_STRIDE: 4 25 | TRANSFORMER_ENC_LAYERS: 6 26 | MASK_FORMER: 27 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 28 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 29 | DEEP_SUPERVISION: True 30 | NO_OBJECT_WEIGHT: 0.1 31 | CLASS_WEIGHT: 2.0 32 | MASK_WEIGHT: 5.0 33 | DICE_WEIGHT: 5.0 34 | HIDDEN_DIM: 256 35 | NUM_OBJECT_QUERIES: 100 36 | NHEADS: 8 37 | DROPOUT: 0.0 38 | DIM_FEEDFORWARD: 2048 39 | ENC_LAYERS: 0 40 | PRE_NORM: False 41 | ENFORCE_INPUT_PROJ: False 42 | SIZE_DIVISIBILITY: 32 43 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 44 | TRAIN_NUM_POINTS: 12544 45 | OVERSAMPLE_RATIO: 3.0 46 | IMPORTANCE_SAMPLE_RATIO: 0.75 47 | PARTS_ON: True 48 | NUM_PART_CLASSES: 107 49 | LOSS_WEIGHT_PARTS: 1.0 50 | LOSS_WEIGHT_PANOPTIC: 1.0 51 | PARTS_CONF_THRESHOLD: 0.1 52 | TEST: 53 | SEMANTIC_ON: True 54 | INSTANCE_ON: False 55 | PANOPTIC_ON: True 56 | PARTS_ON: True 57 | OVERLAP_THRESHOLD: 0.8 58 | OBJECT_MASK_THRESHOLD: 0.5 59 | PPS_EVAL_SPEC: "utils/panoptic_parts/panoptic_parts/specs/eval_specs/ppq_ppp_59_107_cvpr21_default_evalspec.yaml" -------------------------------------------------------------------------------- /configs/pascal/pps/pascal_107/tapps_pascal107_r50_in1kinit.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../Base-Pascal-PPS-LSJ.yaml 2 | OUTPUT_DIR: "output/tapps_pascal107_r50_in1kinit/" 3 | DATASETS: 4 | TRAIN: ("pascal_panoptic_parts_107_train",) 5 | TEST: ("pascal_panoptic_parts_107_val",) 6 | NAME: "Pascal107" 7 | MODEL: 8 | META_ARCHITECTURE: "MaskFormer" 9 | SEM_SEG_HEAD: 10 | NAME: "MaskFormerHead" 11 | IGNORE_VALUE: 255 12 | NUM_CLASSES: 59 13 | LOSS_WEIGHT: 1.0 14 | CONVS_DIM: 256 15 | MASK_DIM: 256 16 | NORM: "GN" 17 | # pixel decoder 18 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 19 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 20 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 21 | COMMON_STRIDE: 4 22 | TRANSFORMER_ENC_LAYERS: 6 23 | MASK_FORMER: 24 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 25 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 26 | DEEP_SUPERVISION: True 27 | NO_OBJECT_WEIGHT: 0.1 28 | CLASS_WEIGHT: 2.0 29 | MASK_WEIGHT: 5.0 30 | DICE_WEIGHT: 5.0 31 | HIDDEN_DIM: 256 32 | NUM_OBJECT_QUERIES: 100 33 | NHEADS: 8 34 | DROPOUT: 0.0 35 | DIM_FEEDFORWARD: 2048 36 | ENC_LAYERS: 0 37 | PRE_NORM: False 
38 | ENFORCE_INPUT_PROJ: False 39 | SIZE_DIVISIBILITY: 32 40 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 41 | TRAIN_NUM_POINTS: 12544 42 | OVERSAMPLE_RATIO: 3.0 43 | IMPORTANCE_SAMPLE_RATIO: 0.75 44 | PARTS_ON: True 45 | NUM_PART_CLASSES: 107 46 | LOSS_WEIGHT_PARTS: 1.0 47 | LOSS_WEIGHT_PANOPTIC: 1.0 48 | PARTS_CONF_THRESHOLD: 0.1 49 | TEST: 50 | SEMANTIC_ON: True 51 | INSTANCE_ON: False 52 | PANOPTIC_ON: True 53 | PARTS_ON: True 54 | OVERLAP_THRESHOLD: 0.8 55 | OBJECT_MASK_THRESHOLD: 0.5 56 | PPS_EVAL_SPEC: "utils/panoptic_parts/panoptic_parts/specs/eval_specs/ppq_ppp_59_107_cvpr21_default_evalspec.yaml" -------------------------------------------------------------------------------- /configs/pascal/pps/tapps_pascal_r50_cocoinit.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Pascal-PPS-LSJ.yaml 2 | OUTPUT_DIR: "output/tapps_pascal_r50_cocoinit/" 3 | SOLVER: 4 | MAX_ITER: 10000 5 | MODEL: 6 | WEIGHTS: "checkpoints/maskformer2_R50_bs16_50ep/model_final_94dc52.pkl" 7 | META_ARCHITECTURE: "MaskFormer" 8 | SEM_SEG_HEAD: 9 | NAME: "MaskFormerHead" 10 | IGNORE_VALUE: 255 11 | NUM_CLASSES: 59 12 | LOSS_WEIGHT: 1.0 13 | CONVS_DIM: 256 14 | MASK_DIM: 256 15 | NORM: "GN" 16 | # pixel decoder 17 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 18 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 19 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 20 | COMMON_STRIDE: 4 21 | TRANSFORMER_ENC_LAYERS: 6 22 | MASK_FORMER: 23 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 24 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 25 | DEEP_SUPERVISION: True 26 | NO_OBJECT_WEIGHT: 0.1 27 | CLASS_WEIGHT: 2.0 28 | MASK_WEIGHT: 5.0 29 | DICE_WEIGHT: 5.0 30 | HIDDEN_DIM: 256 31 | NUM_OBJECT_QUERIES: 100 32 | NHEADS: 8 33 | DROPOUT: 0.0 34 | DIM_FEEDFORWARD: 2048 35 | ENC_LAYERS: 0 36 | PRE_NORM: False 37 | ENFORCE_INPUT_PROJ: False 38 | SIZE_DIVISIBILITY: 32 39 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 40 | TRAIN_NUM_POINTS: 12544 41 | OVERSAMPLE_RATIO: 3.0 42 | IMPORTANCE_SAMPLE_RATIO: 0.75 43 | PARTS_ON: True 44 | NUM_PART_CLASSES: 57 45 | LOSS_WEIGHT_PARTS: 1.0 46 | LOSS_WEIGHT_PANOPTIC: 1.0 47 | PARTS_CONF_THRESHOLD: 0.1 48 | TEST: 49 | SEMANTIC_ON: True 50 | INSTANCE_ON: False 51 | PANOPTIC_ON: True 52 | PARTS_ON: True 53 | OVERLAP_THRESHOLD: 0.8 54 | OBJECT_MASK_THRESHOLD: 0.5 -------------------------------------------------------------------------------- /configs/pascal/pps/tapps_pascal_r50_in1kinit.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Pascal-PPS-LSJ.yaml 2 | OUTPUT_DIR: "output/tapps_pascal_r50_in1kinit/" 3 | MODEL: 4 | META_ARCHITECTURE: "MaskFormer" 5 | SEM_SEG_HEAD: 6 | NAME: "MaskFormerHead" 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 59 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 
8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | PARTS_ON: True 41 | NUM_PART_CLASSES: 57 42 | LOSS_WEIGHT_PARTS: 1.0 43 | LOSS_WEIGHT_PANOPTIC: 1.0 44 | PARTS_CONF_THRESHOLD: 0.1 45 | TEST: 46 | SEMANTIC_ON: True 47 | INSTANCE_ON: False 48 | PANOPTIC_ON: True 49 | PARTS_ON: True 50 | OVERLAP_THRESHOLD: 0.8 51 | OBJECT_MASK_THRESHOLD: 0.5 -------------------------------------------------------------------------------- /configs/pascal/pps/tapps_pascal_swinb_cocoinit.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Pascal-PPS-LSJ.yaml 2 | OUTPUT_DIR: "output/tapps_pascal_swinb_cocoinit/" 3 | SOLVER: 4 | MAX_ITER: 10000 5 | MODEL: 6 | WEIGHTS: "checkpoints/maskformer2_swin_base_IN21k_384_bs16_50ep/model_final_54b88a.pkl" 7 | BACKBONE: 8 | NAME: "D2SwinTransformer" 9 | SWIN: 10 | EMBED_DIM: 128 11 | DEPTHS: [ 2, 2, 18, 2 ] 12 | NUM_HEADS: [ 4, 8, 16, 32 ] 13 | WINDOW_SIZE: 12 14 | APE: False 15 | DROP_PATH_RATE: 0.3 16 | PATCH_NORM: True 17 | PRETRAIN_IMG_SIZE: 384 18 | PIXEL_MEAN: [ 123.675, 116.280, 103.530 ] 19 | PIXEL_STD: [ 58.395, 57.120, 57.375 ] 20 | META_ARCHITECTURE: "MaskFormer" 21 | SEM_SEG_HEAD: 22 | NAME: "MaskFormerHead" 23 | IGNORE_VALUE: 255 24 | NUM_CLASSES: 59 25 | LOSS_WEIGHT: 1.0 26 | CONVS_DIM: 256 27 | MASK_DIM: 256 28 | NORM: "GN" 29 | # pixel decoder 30 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 31 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 32 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 33 | COMMON_STRIDE: 4 34 | TRANSFORMER_ENC_LAYERS: 6 35 | MASK_FORMER: 36 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 37 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 38 | DEEP_SUPERVISION: True 39 | NO_OBJECT_WEIGHT: 0.1 40 | CLASS_WEIGHT: 2.0 41 | MASK_WEIGHT: 5.0 42 | DICE_WEIGHT: 5.0 43 | HIDDEN_DIM: 256 44 | NUM_OBJECT_QUERIES: 100 45 | NHEADS: 8 46 | DROPOUT: 0.0 47 | DIM_FEEDFORWARD: 2048 48 | ENC_LAYERS: 0 49 | PRE_NORM: False 50 | ENFORCE_INPUT_PROJ: False 51 | SIZE_DIVISIBILITY: 32 52 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 53 | TRAIN_NUM_POINTS: 12544 54 | OVERSAMPLE_RATIO: 3.0 55 | IMPORTANCE_SAMPLE_RATIO: 0.75 56 | PARTS_ON: True 57 | NUM_PART_CLASSES: 57 58 | LOSS_WEIGHT_PARTS: 1.0 59 | LOSS_WEIGHT_PANOPTIC: 1.0 60 | PARTS_CONF_THRESHOLD: 0.1 61 | TEST: 62 | SEMANTIC_ON: True 63 | INSTANCE_ON: False 64 | PANOPTIC_ON: True 65 | PARTS_ON: True 66 | OVERLAP_THRESHOLD: 0.8 67 | OBJECT_MASK_THRESHOLD: 0.5 -------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | # Prepare Datasets for TAPPS 2 | 3 | TAPPS has builtin support for two datasets: Cityscapes Panoptic Parts (Cityscapes-PP) and Pascal Panoptic Parts (Pascal-PP) 4 | The datasets are assumed to exist in a directory specified by the environment variable 5 | `DETECTRON2_DATASETS`. 6 | Under this directory, detectron2 will look for datasets in the structure described below. 7 | ``` 8 | $DETECTRON2_DATASETS/ 9 | cityscapes/ 10 | pascal/ 11 | ``` 12 | 13 | You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`. 
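As a minimal sketch of how this root is typically resolved (an assumption based on the same `os.getenv` convention used by `eval/eval_partpq.py` later in this repository; the actual dataset registration lives in `tapps/data/datasets/`, and the variable names below are only illustrative):

```python
import os

# detectron2-style dataset root: honour $DETECTRON2_DATASETS, fall back to ./datasets
root = os.getenv("DETECTRON2_DATASETS", "datasets")

# TAPPS then expects the two builtin datasets directly under this root
cityscapes_pp_root = os.path.join(root, "cityscapes")
pascal_pp_root = os.path.join(root, "pascal")
print(cityscapes_pp_root, pascal_pp_root)
```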
14 | If left unset, the default is `./datasets` relative to your current working directory. 15 | 16 | ## Expected dataset structure for [Cityscapes-PP](https://github.com/pmeletis/panoptic_parts): 17 | 18 | First download the [Cityscapes dataset](https://cityscapes-dataset.com/downloads/) and put the data in the `cityscapes` directory. Download `gtFine_trainvaltest.zip`, `leftImg8bit_trainvaltest.zip`, and `gtFinePanopticParts.zip`. Structure it as below: 19 | 20 | ``` 21 | cityscapes/ 22 | gtFine/ 23 | train/ 24 | aachen/ 25 | ... 26 | val/ 27 | test/ 28 | leftImg8bit/ 29 | train/ 30 | val/ 31 | test/ 32 | gtFinePanopticParts/ 33 | train/ 34 | val/ 35 | ``` 36 | In any directory, clone cityscapesScripts by: 37 | ```bash 38 | git clone https://github.com/mcordts/cityscapesScripts.git 39 | ``` 40 | 41 | To create labelTrainIds.png, first prepare the above structure, then run cityscapesScripts with: 42 | ```bash 43 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesscripts/preparation/createTrainIdLabelImgs.py 44 | ``` 45 | 46 | To generate Cityscapes panoptic dataset, run cityscapesScripts with: 47 | ```bash 48 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesscripts/preparation/createPanopticImgs.py 49 | ``` 50 | 51 | To prepare the part segmentation files necessary for training, run: 52 | ```bash 53 | python datasets/prepare_cityscapes_pp.py 54 | ``` 55 | 56 | After doing this, the data should be in the following structure: 57 | ``` 58 | cityscapes/ 59 | gtFine/ 60 | train/ 61 | aachen/ 62 | ... 63 | val/ 64 | test/ 65 | cityscapes_panoptic_train.json 66 | cityscapes_panoptic_train/ 67 | cityscapes_panoptic_val.json 68 | cityscapes_panoptic_val/ 69 | cityscapes_panoptic_test.json 70 | cityscapes_panoptic_test/ 71 | leftImg8bit/ 72 | train/ 73 | val/ 74 | test/ 75 | gtFinePanopticParts/ 76 | train/ 77 | val/ 78 | gtFineParts/ 79 | train/ 80 | val/ 81 | images_val.json 82 | images_train.json 83 | ``` 84 | 85 | ## Expected dataset structure for [Pascal-PP](https://github.com/pmeletis/panoptic_parts): 86 | 87 | Download the [Pascal-PP labels](https://github.com/pmeletis/panoptic_parts) and the [Pascal VOC 2010 images](http://host.robots.ox.ac.uk/pascal/VOC/voc2010/). 
Organize the data in the following structure: 88 | 89 | ``` 90 | pascal/ 91 | JPEGImages/ # From VOC2010 92 | labels/ # From pascal_panoptic_parts_v2.0 93 | training/ 94 | validation/ 95 | ``` 96 | 97 | To generate the panoptic, semantic and part segmentation annotations and split the images into training and validation splits, run: 98 | ```bash 99 | python datasets/prepare_pascal_pp.py 100 | ``` 101 | 102 | Afterwards, the data should have the following structure: 103 | ``` 104 | pascal/ 105 | images/ 106 | training/ 107 | validation/ 108 | labels/ 109 | training/ 110 | validation/ 111 | panoptic/ 112 | training/ 113 | validation/ 114 | panoptic_training.json 115 | panoptic_validation.json 116 | semantic/ 117 | training/ 118 | validation/ 119 | parts/ 120 | training/ 121 | validation/ 122 | images_training.json 123 | images_validation.json 124 | ``` 125 | 126 | Note: if you wish to use the Pascal-PP-107 labels (instead of the default Pascal-PP-57), also run: 127 | ```bash 128 | python datasets/prepare_pascal_pp_107.py 129 | ``` -------------------------------------------------------------------------------- /datasets/ade20k_instance_catid_mapping.txt: -------------------------------------------------------------------------------- 1 | Instacne100 SceneParse150 FullADE20K 2 | 1 8 165 3 | 2 9 3055 4 | 3 11 350 5 | 4 13 1831 6 | 5 15 774 7 | 5 15 783 8 | 6 16 2684 9 | 7 19 687 10 | 8 20 471 11 | 9 21 401 12 | 10 23 1735 13 | 11 24 2473 14 | 12 25 2329 15 | 13 28 1564 16 | 14 31 57 17 | 15 32 2272 18 | 16 33 907 19 | 17 34 724 20 | 18 36 2985 21 | 18 36 533 22 | 19 37 1395 23 | 20 38 155 24 | 21 39 2053 25 | 22 40 689 26 | 23 42 266 27 | 24 43 581 28 | 25 44 2380 29 | 26 45 491 30 | 27 46 627 31 | 28 48 2388 32 | 29 50 943 33 | 30 51 2096 34 | 31 54 2530 35 | 32 56 420 36 | 33 57 1948 37 | 34 58 1869 38 | 35 59 2251 39 | 36 63 239 40 | 37 65 571 41 | 38 66 2793 42 | 39 67 978 43 | 40 68 236 44 | 41 70 181 45 | 42 71 629 46 | 43 72 2598 47 | 44 73 1744 48 | 45 74 1374 49 | 46 75 591 50 | 47 76 2679 51 | 48 77 223 52 | 49 79 47 53 | 50 81 327 54 | 51 82 2821 55 | 52 83 1451 56 | 53 84 2880 57 | 54 86 480 58 | 55 87 77 59 | 56 88 2616 60 | 57 89 246 61 | 57 89 247 62 | 58 90 2733 63 | 59 91 14 64 | 60 93 38 65 | 61 94 1936 66 | 62 96 120 67 | 63 98 1702 68 | 64 99 249 69 | 65 103 2928 70 | 66 104 2337 71 | 67 105 1023 72 | 68 108 2989 73 | 69 109 1930 74 | 70 111 2586 75 | 71 112 131 76 | 72 113 146 77 | 73 116 95 78 | 74 117 1563 79 | 75 119 1708 80 | 76 120 103 81 | 77 121 1002 82 | 78 122 2569 83 | 79 124 2833 84 | 80 125 1551 85 | 81 126 1981 86 | 82 127 29 87 | 83 128 187 88 | 84 130 747 89 | 85 131 2254 90 | 86 133 2262 91 | 87 134 1260 92 | 88 135 2243 93 | 89 136 2932 94 | 90 137 2836 95 | 91 138 2850 96 | 92 139 64 97 | 93 140 894 98 | 94 143 1919 99 | 95 144 1583 100 | 96 145 318 101 | 97 147 2046 102 | 98 148 1098 103 | 99 149 530 104 | 100 150 954 105 | -------------------------------------------------------------------------------- /datasets/prepare_ade20k_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | import os 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import tqdm 9 | from PIL import Image 10 | 11 | 12 | def convert(input, output): 13 | img = np.asarray(Image.open(input)) 14 | assert img.dtype == np.uint8 15 | img = img - 1 # 0 (ignore) becomes 255. 
others are shifted by 1 16 | Image.fromarray(img).save(output) 17 | 18 | 19 | if __name__ == "__main__": 20 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016" 21 | for name in ["training", "validation"]: 22 | annotation_dir = dataset_dir / "annotations" / name 23 | output_dir = dataset_dir / "annotations_detectron2" / name 24 | output_dir.mkdir(parents=True, exist_ok=True) 25 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 26 | output_file = output_dir / file.name 27 | convert(file, output_file) 28 | -------------------------------------------------------------------------------- /datasets/prepare_coco_semantic_annos_from_panoptic_annos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | 5 | import functools 6 | import json 7 | import multiprocessing as mp 8 | import numpy as np 9 | import os 10 | import time 11 | from fvcore.common.download import download 12 | from panopticapi.utils import rgb2id 13 | from PIL import Image 14 | 15 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES 16 | 17 | 18 | def _process_panoptic_to_semantic(input_panoptic, output_semantic, segments, id_map): 19 | panoptic = np.asarray(Image.open(input_panoptic), dtype=np.uint32) 20 | panoptic = rgb2id(panoptic) 21 | output = np.zeros_like(panoptic, dtype=np.uint8) + 255 22 | for seg in segments: 23 | cat_id = seg["category_id"] 24 | new_cat_id = id_map[cat_id] 25 | output[panoptic == seg["id"]] = new_cat_id 26 | Image.fromarray(output).save(output_semantic) 27 | 28 | 29 | def separate_coco_semantic_from_panoptic(panoptic_json, panoptic_root, sem_seg_root, categories): 30 | """ 31 | Create semantic segmentation annotations from panoptic segmentation 32 | annotations, to be used by PanopticFPN. 33 | It maps all thing categories to class 0, and maps all unlabeled pixels to class 255. 34 | It maps all stuff categories to contiguous ids starting from 1. 35 | Args: 36 | panoptic_json (str): path to the panoptic json file, in COCO's format. 37 | panoptic_root (str): a directory with panoptic annotation files, in COCO's format. 38 | sem_seg_root (str): a directory to output semantic annotation files 39 | categories (list[dict]): category metadata. Each dict needs to have: 40 | "id": corresponds to the "category_id" in the json annotations 41 | "isthing": 0 or 1 42 | """ 43 | os.makedirs(sem_seg_root, exist_ok=True) 44 | 45 | id_map = {} # map from category id to id in the output semantic annotation 46 | assert len(categories) <= 254 47 | for i, k in enumerate(categories): 48 | id_map[k["id"]] = i 49 | # what is id = 0? 50 | # id_map[0] = 255 51 | print(id_map) 52 | 53 | with open(panoptic_json) as f: 54 | obj = json.load(f) 55 | 56 | pool = mp.Pool(processes=max(mp.cpu_count() // 2, 4)) 57 | 58 | def iter_annotations(): 59 | for anno in obj["annotations"]: 60 | file_name = anno["file_name"] 61 | segments = anno["segments_info"] 62 | input = os.path.join(panoptic_root, file_name) 63 | output = os.path.join(sem_seg_root, file_name) 64 | yield input, output, segments 65 | 66 | print("Start writing to {} ...".format(sem_seg_root)) 67 | start = time.time() 68 | pool.starmap( 69 | functools.partial(_process_panoptic_to_semantic, id_map=id_map), 70 | iter_annotations(), 71 | chunksize=100, 72 | ) 73 | print("Finished. 
time: {:.2f}s".format(time.time() - start)) 74 | 75 | 76 | if __name__ == "__main__": 77 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco") 78 | for s in ["val2017", "train2017"]: 79 | separate_coco_semantic_from_panoptic( 80 | os.path.join(dataset_dir, "annotations/panoptic_{}.json".format(s)), 81 | os.path.join(dataset_dir, "panoptic_{}".format(s)), 82 | os.path.join(dataset_dir, "panoptic_semseg_{}".format(s)), 83 | COCO_CATEGORIES, 84 | ) 85 | -------------------------------------------------------------------------------- /eval/eval_partpq.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import argparse 4 | 5 | sys.path.append("utils/panoptic_parts") 6 | from panoptic_parts.evaluation import eval_PartPQ 7 | 8 | def eval_partpq(save_dir, dataset): 9 | root = os.getenv("DETECTRON2_DATASETS", "datasets") 10 | 11 | if dataset in ['pascal', 'Pascal', 'pascal57', 'Pascal57']: 12 | eval_spec_path = "utils/panoptic_parts/panoptic_parts/specs/eval_specs/ppq_ppp_59_57_cvpr21_default_evalspec.yaml" 13 | gt_path = os.path.join(root, "pascal/labels/validation") 14 | images_json = os.path.join(root, "pascal/images_validation.json") 15 | elif dataset in ['pascal107, Pascal107']: 16 | eval_spec_path = "utils/panoptic_parts/panoptic_parts/specs/eval_specs/ppq_ppp_59_107_cvpr21_default_evalspec.yaml" 17 | gt_path = os.path.join(root, "pascal/labels/validation") 18 | images_json = os.path.join(root, "pascal/images_validation.json") 19 | elif dataset in ['cityscapes', 'Cityscapes']: 20 | eval_spec_path = "utils/panoptic_parts/panoptic_parts/specs/eval_specs/ppq_cpp_19_23_cvpr21_grouped_evalspec.yaml" 21 | gt_path = os.path.join(root, "cityscapes/gtFinePanopticParts/val/") 22 | images_json = os.path.join(root, "cityscapes/images_val.json") 23 | else: 24 | raise NotImplementedError(f"Only implemented for Pascal, Pascal107 and Cityscapes, not {dataset}.") 25 | 26 | pps_pred_path = os.path.join(save_dir, "pps") 27 | results_dir = os.path.join(save_dir, "results") 28 | 29 | # Eval PPS predictions with PartPQ 30 | results = eval_PartPQ.evaluate(eval_spec_path, 31 | gt_path, 32 | pps_pred_path, 33 | images_json, 34 | save_dir=results_dir, 35 | return_results=True) 36 | 37 | part_pq = results[0][0]["PartPQ"] 38 | part_pq_p = results[0][1]["PartPQ_parts"] 39 | part_pq_np = results[0][2]["PartPQ_noparts"] 40 | metrics = {"part_pq": part_pq, "part_pq_p": part_pq_p, "part_pq_np": part_pq_np} 41 | print(metrics) 42 | 43 | 44 | if __name__ == "__main__": 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument('--save_dir', type=str) 47 | parser.add_argument('--dataset', type=str) 48 | args = parser.parse_args() 49 | 50 | save_dir = args.save_dir 51 | dataset = args.dataset 52 | 53 | eval_partpq(save_dir, 54 | dataset) 55 | -------------------------------------------------------------------------------- /eval/visualize_pps.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | import sys 5 | import psutil 6 | 7 | from PIL import Image 8 | from functools import partial 9 | 10 | import multiprocessing 11 | from multiprocessing import Pool 12 | 13 | sys.path.append("utils/panoptic_parts") 14 | from panoptic_parts.utils.format import encode_ids 15 | from panoptic_parts.utils.visualization import experimental_colorize_label 16 | from panoptic_parts.specs.eval_spec import PartPQEvalSpec 17 | 18 | 19 | def 
colorize_pps_and_store_single(file, predictions_path, sid2color, save_path, eval_spec, dataset, cpu_aff=None): 20 | predictions_np = np.array(Image.open(os.path.join(predictions_path, file)), dtype=np.int32) 21 | sids = predictions_np[..., 0] 22 | iids = predictions_np[..., 1] 23 | pids = predictions_np[..., 2] 24 | 25 | sids_no_parts = eval_spec.eval_sid_no_parts 26 | sids_stuff = eval_spec.eval_sid_stuff 27 | 28 | sids_wo_parts = np.isin(sids, sids_no_parts) 29 | sids_w_stuff = np.isin(sids, sids_stuff) 30 | iids[sids_w_stuff] = -1 31 | pids[sids_wo_parts] = -1 32 | 33 | sids[sids == 255] = 0 34 | pids[pids == 255] = -1 35 | iids[iids == 255] = -1 36 | uids = encode_ids(sids, iids, pids) 37 | 38 | if cpu_aff is not None: 39 | process = psutil.Process() 40 | process.cpu_affinity(cpu_aff) 41 | 42 | if dataset == 'cityscapes' or dataset == 'Cityscapes': 43 | is_cpp = True 44 | else: 45 | is_cpp = False 46 | 47 | pps_colors = experimental_colorize_label(uids, sid2color=sid2color, is_cpp=is_cpp) 48 | pps_colors_img = Image.fromarray(pps_colors.astype(np.uint8)) 49 | pps_colors_img.save(os.path.join(save_path, file)) 50 | 51 | 52 | def convert_pps_to_colors_and_store(predictions_path, save_path, eval_spec_path, dataset): 53 | eval_spec = PartPQEvalSpec(eval_spec_path) 54 | sid2color = eval_spec.dataset_spec.sid2scene_color 55 | 56 | files = list() 57 | for file in os.listdir(predictions_path): 58 | if file.endswith(".png"): 59 | files.append(file) 60 | 61 | if not os.path.exists(save_path): 62 | os.mkdir(save_path) 63 | 64 | # for file in tqdm(files): 65 | process = psutil.Process() 66 | cpu_aff = process.cpu_affinity() 67 | 68 | num_cpus = round(multiprocessing.cpu_count() / 2) 69 | 70 | colorize_pps_and_store_single_fn = partial(colorize_pps_and_store_single, 71 | predictions_path=predictions_path, 72 | sid2color=sid2color, 73 | save_path=save_path, 74 | cpu_aff=cpu_aff, 75 | eval_spec=eval_spec, 76 | dataset=dataset) 77 | print(f"Now visualizing {len(files)} PPS predictions... 
this could take a while.") 78 | with Pool(num_cpus) as p: 79 | p.map(colorize_pps_and_store_single_fn, files) 80 | 81 | 82 | def visualize(pred_dir, save_dir, dataset): 83 | if dataset in ['pascal', 'Pascal', 'pascal57', 'Pascal57']: 84 | eval_spec_path = "utils/panoptic_parts/panoptic_parts/specs/eval_specs/ppq_ppp_59_57_cvpr21_default_evalspec.yaml" 85 | elif dataset in ['pascal107, Pascal107']: 86 | eval_spec_path = "utils/panoptic_parts/panoptic_parts/specs/eval_specs/ppq_ppp_59_107_cvpr21_default_evalspec.yaml" 87 | elif dataset in ['cityscapes', 'Cityscapes']: 88 | eval_spec_path = "utils/panoptic_parts/panoptic_parts/specs/eval_specs/ppq_cpp_19_23_cvpr21_grouped_evalspec.yaml" 89 | else: 90 | raise NotImplementedError(f"Only implemented for Pascal, Pascal107 and Cityscapes, not {dataset}.") 91 | 92 | if not os.path.isdir(save_dir): 93 | os.mkdir(save_dir) 94 | 95 | convert_pps_to_colors_and_store(pred_dir, save_dir, eval_spec_path, dataset=dataset) 96 | 97 | if __name__ == "__main__": 98 | parser = argparse.ArgumentParser() 99 | parser.add_argument('--pred_dir', type=str) 100 | parser.add_argument('--save_dir', type=str) 101 | parser.add_argument('--dataset', type=str) 102 | args = parser.parse_args() 103 | 104 | pred_dir = args.pred_dir 105 | save_dir = args.save_dir 106 | dataset = args.dataset 107 | 108 | visualize(pred_dir, 109 | save_dir, 110 | dataset) 111 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | scipy 3 | shapely 4 | timm 5 | h5py 6 | submitit 7 | scikit-image 8 | psutil 9 | scikit-learn -------------------------------------------------------------------------------- /tapps/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import data # register all new datasets 3 | from . 
import modeling 4 | 5 | # config 6 | from .config import add_maskformer2_config 7 | 8 | # dataset loading 9 | from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper 10 | from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper 11 | from .data.dataset_mappers.mask_former_instance_dataset_mapper import ( 12 | MaskFormerInstanceDatasetMapper, 13 | ) 14 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import ( 15 | MaskFormerPanopticDatasetMapper, 16 | ) 17 | 18 | from .data.dataset_mappers.mask_former_panoptic_parts_dataset_mapper import ( 19 | MaskFormerPanopticPartsDatasetMapper, 20 | ) 21 | 22 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 23 | MaskFormerSemanticDatasetMapper, 24 | ) 25 | 26 | from .data.dataset_mappers.pascal_panoptic_parts_new_baseline_dataset_mapper import ( 27 | PascalPanopticPartsNewBaselineDatasetMapper, 28 | ) 29 | 30 | from .data.datasets.register_pascal_panoptic_parts import register_all_pascal_panoptic_parts 31 | from .data.datasets.register_cityscapes_panoptic_parts import register_all_cityscapes_panoptic_parts 32 | from .data.datasets.register_pascal_panoptic_parts_107 import register_all_pascal_panoptic_parts_107 33 | 34 | # models 35 | from .maskformer_model import MaskFormer 36 | from .test_time_augmentation import SemanticSegmentorWithTTA 37 | 38 | # evaluation 39 | from .evaluation.instance_evaluation import InstanceSegEvaluator 40 | -------------------------------------------------------------------------------- /tapps/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import datasets 3 | -------------------------------------------------------------------------------- /tapps/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /tapps/data/dataset_mappers/augmentations.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | """ 4 | Implement many useful :class:`Augmentation`. 5 | """ 6 | 7 | import numpy as np 8 | from typing import List, Optional, Union 9 | 10 | 11 | from fvcore.transforms.transform import ( 12 | Transform, 13 | TransformList, 14 | ) 15 | 16 | from detectron2.data.transforms.augmentation import Augmentation, AugmentationList 17 | 18 | __all__ = [ 19 | "AugInput", 20 | ] 21 | 22 | 23 | def _check_img_dtype(img): 24 | assert isinstance(img, np.ndarray), "[Augmentation] Needs an numpy array, but got a {}!".format( 25 | type(img) 26 | ) 27 | assert not isinstance(img.dtype, np.integer) or ( 28 | img.dtype == np.uint8 29 | ), "[Augmentation] Got image of type {}, use uint8 or floating points instead!".format( 30 | img.dtype 31 | ) 32 | assert img.ndim in [2, 3], img.ndim 33 | 34 | 35 | 36 | class AugInput: 37 | """ 38 | Input that can be used with :meth:`Augmentation.__call__`. 39 | This is a standard implementation for the majority of use cases. 40 | This class provides the standard attributes **"image", "boxes", "sem_seg"** 41 | defined in :meth:`__init__` and they may be needed by different augmentations. 
42 | Most augmentation policies do not need attributes beyond these three. 43 | 44 | After applying augmentations to these attributes (using :meth:`AugInput.transform`), 45 | the returned transforms can then be used to transform other data structures that users have. 46 | 47 | Examples: 48 | :: 49 | input = AugInput(image, boxes=boxes) 50 | tfms = augmentation(input) 51 | transformed_image = input.image 52 | transformed_boxes = input.boxes 53 | transformed_other_data = tfms.apply_other(other_data) 54 | 55 | An extended project that works with new data types may implement augmentation policies 56 | that need other inputs. An algorithm may need to transform inputs in a way different 57 | from the standard approach defined in this class. In those rare situations, users can 58 | implement a class similar to this class, that satify the following condition: 59 | 60 | * The input must provide access to these data in the form of attribute access 61 | (``getattr``). For example, if an :class:`Augmentation` to be applied needs "image" 62 | and "sem_seg" arguments, its input must have the attribute "image" and "sem_seg". 63 | * The input must have a ``transform(tfm: Transform) -> None`` method which 64 | in-place transforms all its attributes. 65 | """ 66 | 67 | def __init__( 68 | self, 69 | image: np.ndarray, 70 | *, 71 | boxes: Optional[np.ndarray] = None, 72 | sem_seg: Optional[np.ndarray] = None, 73 | part_seg: Optional[np.ndarray] = None, 74 | ratio = None 75 | ): 76 | """ 77 | Args: 78 | image (ndarray): (H,W) or (H,W,C) ndarray of type uint8 in range [0, 255], or 79 | floating point in range [0, 1] or [0, 255]. The meaning of C is up 80 | to users. 81 | boxes (ndarray or None): Nx4 float32 boxes in XYXY_ABS mode 82 | sem_seg (ndarray or None): HxW uint8 semantic segmentation mask. Each element 83 | is an integer label of pixel. 84 | part_seg (ndarray or None): HxW uint8 part segmentation mask. Each element 85 | is an integer label of pixel. 86 | """ 87 | _check_img_dtype(image) 88 | self.image = image 89 | self.boxes = boxes 90 | self.sem_seg = sem_seg 91 | self.part_seg = part_seg 92 | self.ratio = ratio 93 | 94 | def transform(self, tfm: Transform) -> None: 95 | """ 96 | In-place transform all attributes of this class. 97 | 98 | By "in-place", it means after calling this method, accessing an attribute such 99 | as ``self.image`` will return transformed data. 100 | """ 101 | self.image = tfm.apply_image(self.image) 102 | if self.boxes is not None: 103 | self.boxes = tfm.apply_box(self.boxes) 104 | if self.sem_seg is not None: 105 | self.sem_seg = tfm.apply_segmentation(self.sem_seg) 106 | if self.part_seg is not None: 107 | self.part_seg = tfm.apply_segmentation(self.part_seg) 108 | 109 | def apply_augmentations( 110 | self, augmentations: List[Union[Augmentation, Transform]] 111 | ) -> TransformList: 112 | """ 113 | Equivalent of ``AugmentationList(augmentations)(self)`` 114 | """ 115 | return AugmentationList(augmentations)(self) 116 | -------------------------------------------------------------------------------- /tapps/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . 
import ( 3 | register_ade20k_full, 4 | register_ade20k_panoptic, 5 | register_coco_stuff_10k, 6 | register_mapillary_vistas, 7 | register_coco_panoptic_annos_semseg, 8 | register_ade20k_instance, 9 | register_mapillary_vistas_panoptic, 10 | register_cityscapes_panoptic_parts, 11 | register_pascal_panoptic_parts, 12 | register_pascal_panoptic_parts_107, 13 | ) 14 | -------------------------------------------------------------------------------- /tapps/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tue-mps/tapps/54d8d9f039fa91da6e046bfd8c62c13a9dc5e8b0/tapps/evaluation/__init__.py -------------------------------------------------------------------------------- /tapps/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .backbone.swin import D2SwinTransformer 3 | from .pixel_decoder.fpn import BasePixelDecoder 4 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder 5 | from .meta_arch.mask_former_head import MaskFormerHead 6 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead 7 | -------------------------------------------------------------------------------- /tapps/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /tapps/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /tapps/modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /tapps/modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /tapps/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | try: 22 | import MultiScaleDeformableAttention as MSDA 23 | except ModuleNotFoundError as e: 24 | info_string = ( 25 | "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" 26 | "\t`cd tapps/modeling/pixel_decoder/ops`\n" 27 | "\t`sh make.sh`\n" 28 | ) 29 | raise ModuleNotFoundError(info_string) 30 | 31 | 32 | class MSDeformAttnFunction(Function): 33 | @staticmethod 34 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 35 | ctx.im2col_step = im2col_step 36 | output = MSDA.ms_deform_attn_forward( 37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 38 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 39 | return output 40 | 41 | @staticmethod 42 | @once_differentiable 43 | def backward(ctx, grad_output): 44 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 45 | grad_value, grad_sampling_loc, grad_attn_weight = \ 46 | MSDA.ms_deform_attn_backward( 47 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 48 | 49 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 50 | 51 | 52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 53 | # for debug and test only, 54 | # need to use cuda version instead 55 | N_, S_, M_, D_ = value.shape 56 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 57 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 58 | sampling_grids = 2 * sampling_locations - 1 59 | sampling_value_list = [] 60 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 61 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 62 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 63 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 64 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 65 | # N_*M_, D_, Lq_, P_ 66 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 67 | mode='bilinear', padding_mode='zeros', align_corners=False) 68 | sampling_value_list.append(sampling_value_l_) 69 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 70 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 71 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 72 | return 
output.transpose(1, 2).contiguous() 73 | -------------------------------------------------------------------------------- /tapps/modeling/pixel_decoder/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | # Copyright (c) Facebook, Inc. and its affiliates. 11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 12 | 13 | python setup.py build install 14 | -------------------------------------------------------------------------------- /tapps/modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /tapps/modeling/pixel_decoder/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | import os 13 | import glob 14 | 15 | import torch 16 | 17 | from torch.utils.cpp_extension import CUDA_HOME 18 | from torch.utils.cpp_extension import CppExtension 19 | from torch.utils.cpp_extension import CUDAExtension 20 | 21 | from setuptools import find_packages 22 | from setuptools import setup 23 | 24 | requirements = ["torch", "torchvision"] 25 | 26 | def get_extensions(): 27 | this_dir = os.path.dirname(os.path.abspath(__file__)) 28 | extensions_dir = os.path.join(this_dir, "src") 29 | 30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 33 | 34 | sources = main_file + source_cpu 35 | extension = CppExtension 36 | extra_compile_args = {"cxx": []} 37 | define_macros = [] 38 | 39 | # Force cuda since torch ask for a device, not if cuda is in fact available. 40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: 41 | extension = CUDAExtension 42 | sources += source_cuda 43 | define_macros += [("WITH_CUDA", None)] 44 | extra_compile_args["nvcc"] = [ 45 | "-DCUDA_HAS_FP16=1", 46 | "-D__CUDA_NO_HALF_OPERATORS__", 47 | "-D__CUDA_NO_HALF_CONVERSIONS__", 48 | "-D__CUDA_NO_HALF2_OPERATORS__", 49 | ] 50 | else: 51 | if CUDA_HOME is None: 52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') 53 | else: 54 | raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') 55 | 56 | sources = [os.path.join(extensions_dir, s) for s in sources] 57 | include_dirs = [extensions_dir] 58 | ext_modules = [ 59 | extension( 60 | "MultiScaleDeformableAttention", 61 | sources, 62 | include_dirs=include_dirs, 63 | define_macros=define_macros, 64 | extra_compile_args=extra_compile_args, 65 | ) 66 | ] 67 | return ext_modules 68 | 69 | setup( 70 | name="MultiScaleDeformableAttention", 71 | version="1.0", 72 | author="Weijie Su", 73 | url="https://github.com/fundamentalvision/Deformable-DETR", 74 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 75 | packages=find_packages(exclude=("configs", "tests",)), 76 | ext_modules=get_extensions(), 77 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 78 | ) 79 | -------------------------------------------------------------------------------- /tapps/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | 22 | at::Tensor 23 | ms_deform_attn_cpu_forward( 24 | const at::Tensor &value, 25 | const at::Tensor &spatial_shapes, 26 | const at::Tensor &level_start_index, 27 | const at::Tensor &sampling_loc, 28 | const at::Tensor &attn_weight, 29 | const int im2col_step) 30 | { 31 | AT_ERROR("Not implement on cpu"); 32 | } 33 | 34 | std::vector 35 | ms_deform_attn_cpu_backward( 36 | const at::Tensor &value, 37 | const at::Tensor &spatial_shapes, 38 | const at::Tensor &level_start_index, 39 | const at::Tensor &sampling_loc, 40 | const at::Tensor &attn_weight, 41 | const at::Tensor &grad_output, 42 | const int im2col_step) 43 | { 44 | AT_ERROR("Not implement on cpu"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /tapps/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor 20 | ms_deform_attn_cpu_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step); 27 | 28 | std::vector 29 | ms_deform_attn_cpu_backward( 30 | const at::Tensor &value, 31 | const at::Tensor &spatial_shapes, 32 | const at::Tensor &level_start_index, 33 | const at::Tensor &sampling_loc, 34 | const at::Tensor &attn_weight, 35 | const at::Tensor &grad_output, 36 | const int im2col_step); 37 | 38 | 39 | -------------------------------------------------------------------------------- /tapps/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor ms_deform_attn_cuda_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step); 26 | 27 | std::vector ms_deform_attn_cuda_backward( 28 | const at::Tensor &value, 29 | const at::Tensor &spatial_shapes, 30 | const at::Tensor &level_start_index, 31 | const at::Tensor &sampling_loc, 32 | const at::Tensor &attn_weight, 33 | const at::Tensor &grad_output, 34 | const int im2col_step); 35 | 36 | -------------------------------------------------------------------------------- /tapps/modeling/pixel_decoder/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | 18 | #include "cpu/ms_deform_attn_cpu.h" 19 | 20 | #ifdef WITH_CUDA 21 | #include "cuda/ms_deform_attn_cuda.h" 22 | #endif 23 | 24 | 25 | at::Tensor 26 | ms_deform_attn_forward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const int im2col_step) 33 | { 34 | if (value.type().is_cuda()) 35 | { 36 | #ifdef WITH_CUDA 37 | return ms_deform_attn_cuda_forward( 38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector 47 | ms_deform_attn_backward( 48 | const at::Tensor &value, 49 | const at::Tensor &spatial_shapes, 50 | const at::Tensor &level_start_index, 51 | const at::Tensor &sampling_loc, 52 | const at::Tensor &attn_weight, 53 | const at::Tensor &grad_output, 54 | const int im2col_step) 55 | { 56 | if (value.type().is_cuda()) 57 | { 58 | #ifdef WITH_CUDA 59 | return ms_deform_attn_cuda_backward( 60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 61 | #else 62 | AT_ERROR("Not compiled with GPU support"); 63 | #endif 64 | } 65 | AT_ERROR("Not implemented on the CPU"); 66 | } 67 | 68 | -------------------------------------------------------------------------------- /tapps/modeling/pixel_decoder/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include "ms_deform_attn.h" 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /tapps/modeling/pixel_decoder/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import time 17 | import torch 18 | import torch.nn as nn 19 | from torch.autograd import gradcheck 20 | 21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | N, M, D = 1, 2, 2 25 | Lq, L, P = 2, 2, 2 26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 28 | S = sum([(H*W).item() for H, W in shapes]) 29 | 30 | 31 | torch.manual_seed(3) 32 | 33 | 34 | @torch.no_grad() 35 | def check_forward_equal_with_pytorch_double(): 36 | value = torch.rand(N, S, M, D).cuda() * 0.01 37 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 38 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 39 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 40 | im2col_step = 2 41 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 42 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 43 | fwdok = torch.allclose(output_cuda, output_pytorch) 44 | max_abs_err = (output_cuda - output_pytorch).abs().max() 45 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 46 | 47 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 48 | 49 | 50 | @torch.no_grad() 51 | def check_forward_equal_with_pytorch_float(): 52 | value = torch.rand(N, 
S, M, D).cuda() * 0.01 53 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 54 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 55 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 56 | im2col_step = 2 57 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 58 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 59 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 60 | max_abs_err = (output_cuda - output_pytorch).abs().max() 61 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 62 | 63 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 64 | 65 | 66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 67 | 68 | value = torch.rand(N, S, M, channels).cuda() * 0.01 69 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 70 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 71 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 72 | im2col_step = 2 73 | func = MSDeformAttnFunction.apply 74 | 75 | value.requires_grad = grad_value 76 | sampling_locations.requires_grad = grad_sampling_loc 77 | attention_weights.requires_grad = grad_attn_weight 78 | 79 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 80 | 81 | print(f'* {gradok} check_gradient_numerical(D={channels})') 82 | 83 | 84 | if __name__ == '__main__': 85 | check_forward_equal_with_pytorch_double() 86 | check_forward_equal_with_pytorch_float() 87 | 88 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 89 | check_gradient_numerical(channels, True, True, True) 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /tapps/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
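# Note: this package gathers the transformer decoder heads used in this repository: the original MaskFormer decoder (StandardTransformerDecoder), the Mask2Former multi-scale masked decoder (MultiScaleMaskedTransformerDecoder), and the PartDecoder that predicts part-level masks per query (see part_decoder.py below).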
2 | from .maskformer_transformer_decoder import StandardTransformerDecoder 3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder 4 | from .part_decoder import PartDecoder 5 | -------------------------------------------------------------------------------- /tapps/modeling/transformer_decoder/part_decoder.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import fvcore.nn.weight_init as weight_init 3 | from typing import Optional 4 | import torch 5 | from torch import nn, Tensor 6 | from torch.nn import functional as F 7 | import numpy as np 8 | 9 | 10 | class MLP(nn.Module): 11 | """ Very simple multi-layer perceptron (also called FFN)""" 12 | 13 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 14 | super().__init__() 15 | self.num_layers = num_layers 16 | h = [hidden_dim] * (num_layers - 1) 17 | self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) 18 | 19 | def forward(self, x): 20 | for i, layer in enumerate(self.layers): 21 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 22 | return x 23 | 24 | 25 | class PartDecoder(nn.Module): 26 | def __init__(self, 27 | num_part_classes, 28 | input_dim, 29 | hidden_dim, 30 | mask_dim, 31 | ): 32 | super().__init__() 33 | 34 | self.num_part_classes = num_part_classes 35 | 36 | self.mask_head = MLP(input_dim, hidden_dim, mask_dim * num_part_classes, num_layers=3) 37 | 38 | def forward(self, queries, num_parts_per_query, mask_features, part_ids_per_query=None): 39 | # queries shape: [Nb, num_queries (padded to max), hidden_dim] 40 | 41 | mask_embeds = self.mask_head(queries) 42 | mask_embeds = torch.tensor_split(mask_embeds, self.num_part_classes, dim=2) 43 | 44 | # mask_embeds_total is [Nb, num_queries, num_partcls, num_channels] 45 | mask_embeds_total = torch.stack(mask_embeds, dim=2) 46 | embeds_shape = mask_embeds_total.shape 47 | 48 | # mask_embeds_total is [Nb, num_queries * num_partcls, num_channels] 49 | mask_embeds_total = mask_embeds_total.view(embeds_shape[0], 50 | embeds_shape[1] * embeds_shape[2], 51 | embeds_shape[3]) 52 | 53 | # outputs_mask shape is [Nb, num_queries * num_partcls, height, width] 54 | outputs_mask = torch.einsum("bqc,bchw->bqhw", mask_embeds_total, mask_features) 55 | # outputs_mask shape is [Nb, num_queries, num_partcls, height, width] 56 | outputs_mask = outputs_mask.view(embeds_shape[0], 57 | embeds_shape[1], 58 | embeds_shape[2], 59 | outputs_mask.shape[2], 60 | outputs_mask.shape[3]) 61 | 62 | # num_parts_per_query: list of length batch_size 63 | gather_batch_dim = [] 64 | gather_num_queries = [] 65 | gather_num_partcls = [] 66 | 67 | for i, num_parts in enumerate(num_parts_per_query): 68 | if len(num_parts) != 0: 69 | if part_ids_per_query is None: 70 | idx_partcls = torch.cat([torch.arange(0, num_part) for num_part in num_parts], dim=0) 71 | else: 72 | idx_partcls = torch.cat([pt_idx for pt_idx in part_ids_per_query[i]]) 73 | idx_queries = torch.cat([torch.full_like(torch.arange(0, num_part), e, dtype=torch.long) 74 | for e, num_part in enumerate(num_parts)], dim=0) 75 | idx_batch = torch.full_like(idx_partcls, fill_value=i, dtype=torch.long) 76 | 77 | gather_batch_dim.append(idx_batch) 78 | gather_num_partcls.append(idx_partcls) 79 | gather_num_queries.append(idx_queries) 80 | 81 | if len(gather_batch_dim) != 0: 82 | gather_batch_dim = torch.cat(gather_batch_dim, dim=0) 83 | gather_num_queries = torch.cat(gather_num_queries, dim=0) 84 | 
gather_num_partcls = torch.cat(gather_num_partcls, dim=0) 85 | 86 | else: 87 | gather_batch_dim = torch.zeros([0], dtype=torch.long, device=mask_features.device) 88 | gather_num_queries = torch.zeros([0], dtype=torch.long, device=mask_features.device) 89 | gather_num_partcls = torch.zeros([0], dtype=torch.long, device=mask_features.device) 90 | 91 | output_masks = outputs_mask[gather_batch_dim, gather_num_queries, gather_num_partcls] 92 | 93 | return output_masks -------------------------------------------------------------------------------- /tapps/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | if mask is None: 31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 32 | not_mask = ~mask 33 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 34 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 35 | if self.normalize: 36 | eps = 1e-6 37 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 38 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 39 | 40 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 41 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 42 | 43 | pos_x = x_embed[:, :, :, None] / dim_t 44 | pos_y = y_embed[:, :, :, None] / dim_t 45 | pos_x = torch.stack( 46 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 47 | ).flatten(3) 48 | pos_y = torch.stack( 49 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 50 | ).flatten(3) 51 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 52 | return pos 53 | 54 | def __repr__(self, _repr_indent=4): 55 | head = "Positional encoding " + self.__class__.__name__ 56 | body = [ 57 | "num_pos_feats: {}".format(self.num_pos_feats), 58 | "temperature: {}".format(self.temperature), 59 | "normalize: {}".format(self.normalize), 60 | "scale: {}".format(self.scale), 61 | ] 62 | # _repr_indent = 4 63 | lines = [head] + [" " * _repr_indent + line for line in body] 64 | return "\n".join(lines) 65 | -------------------------------------------------------------------------------- /tapps/test_time_augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
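# SemanticSegmentorWithTTA (defined below) runs the wrapped model on each augmented version of an input produced by DatasetMapperTTA, flips the "sem_seg" prediction back along the width axis when an HFlipTransform was applied, and averages the logits over all augmentations.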
2 | import copy 3 | import logging 4 | from itertools import count 5 | 6 | import numpy as np 7 | import torch 8 | from fvcore.transforms import HFlipTransform 9 | from torch import nn 10 | from torch.nn.parallel import DistributedDataParallel 11 | 12 | from detectron2.data.detection_utils import read_image 13 | from detectron2.modeling import DatasetMapperTTA 14 | 15 | 16 | __all__ = [ 17 | "SemanticSegmentorWithTTA", 18 | ] 19 | 20 | 21 | class SemanticSegmentorWithTTA(nn.Module): 22 | """ 23 | A SemanticSegmentor with test-time augmentation enabled. 24 | Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`. 25 | """ 26 | 27 | def __init__(self, cfg, model, tta_mapper=None, batch_size=1): 28 | """ 29 | Args: 30 | cfg (CfgNode): 31 | model (SemanticSegmentor): a SemanticSegmentor to apply TTA on. 32 | tta_mapper (callable): takes a dataset dict and returns a list of 33 | augmented versions of the dataset dict. Defaults to 34 | `DatasetMapperTTA(cfg)`. 35 | batch_size (int): batch the augmented images into this batch size for inference. 36 | """ 37 | super().__init__() 38 | if isinstance(model, DistributedDataParallel): 39 | model = model.module 40 | self.cfg = cfg.clone() 41 | 42 | self.model = model 43 | 44 | if tta_mapper is None: 45 | tta_mapper = DatasetMapperTTA(cfg) 46 | self.tta_mapper = tta_mapper 47 | self.batch_size = batch_size 48 | 49 | def __call__(self, batched_inputs): 50 | """ 51 | Same input/output format as :meth:`SemanticSegmentor.forward` 52 | """ 53 | 54 | def _maybe_read_image(dataset_dict): 55 | ret = copy.copy(dataset_dict) 56 | if "image" not in ret: 57 | image = read_image(ret.pop("file_name"), self.model.input_format) 58 | image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW 59 | ret["image"] = image 60 | if "height" not in ret and "width" not in ret: 61 | ret["height"] = image.shape[1] 62 | ret["width"] = image.shape[2] 63 | return ret 64 | 65 | processed_results = [] 66 | for x in batched_inputs: 67 | result = self._inference_one_image(_maybe_read_image(x)) 68 | processed_results.append(result) 69 | return processed_results 70 | 71 | def _inference_one_image(self, input): 72 | """ 73 | Args: 74 | input (dict): one dataset dict with "image" field being a CHW tensor 75 | Returns: 76 | dict: one output dict 77 | """ 78 | orig_shape = (input["height"], input["width"]) 79 | augmented_inputs, tfms = self._get_augmented_inputs(input) 80 | 81 | final_predictions = None 82 | count_predictions = 0 83 | for input, tfm in zip(augmented_inputs, tfms): 84 | count_predictions += 1 85 | with torch.no_grad(): 86 | if final_predictions is None: 87 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 88 | final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2]) 89 | else: 90 | final_predictions = self.model([input])[0].pop("sem_seg") 91 | else: 92 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 93 | final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2]) 94 | else: 95 | final_predictions += self.model([input])[0].pop("sem_seg") 96 | 97 | final_predictions = final_predictions / count_predictions 98 | return {"sem_seg": final_predictions} 99 | 100 | def _get_augmented_inputs(self, input): 101 | augmented_inputs = self.tta_mapper(input) 102 | tfms = [x.pop("transforms") for x in augmented_inputs] 103 | return augmented_inputs, tfms 104 | -------------------------------------------------------------------------------- /tapps/utils/__init__.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /tapps/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py 3 | """ 4 | Misc functions, including distributed helpers. 5 | 6 | Mostly copy-paste from torchvision references. 7 | """ 8 | from typing import List, Optional 9 | 10 | import torch 11 | import torch.distributed as dist 12 | import torchvision 13 | from torch import Tensor 14 | 15 | 16 | def _max_by_axis(the_list): 17 | # type: (List[List[int]]) -> List[int] 18 | maxes = the_list[0] 19 | for sublist in the_list[1:]: 20 | for index, item in enumerate(sublist): 21 | maxes[index] = max(maxes[index], item) 22 | return maxes 23 | 24 | 25 | class NestedTensor(object): 26 | def __init__(self, tensors, mask: Optional[Tensor]): 27 | self.tensors = tensors 28 | self.mask = mask 29 | 30 | def to(self, device): 31 | # type: (Device) -> NestedTensor # noqa 32 | cast_tensor = self.tensors.to(device) 33 | mask = self.mask 34 | if mask is not None: 35 | assert mask is not None 36 | cast_mask = mask.to(device) 37 | else: 38 | cast_mask = None 39 | return NestedTensor(cast_tensor, cast_mask) 40 | 41 | def decompose(self): 42 | return self.tensors, self.mask 43 | 44 | def __repr__(self): 45 | return str(self.tensors) 46 | 47 | 48 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 49 | # TODO make this more general 50 | if tensor_list[0].ndim == 3: 51 | if torchvision._is_tracing(): 52 | # nested_tensor_from_tensor_list() does not export well to ONNX 53 | # call _onnx_nested_tensor_from_tensor_list() instead 54 | return _onnx_nested_tensor_from_tensor_list(tensor_list) 55 | 56 | # TODO make it support different-sized images 57 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 58 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 59 | batch_shape = [len(tensor_list)] + max_size 60 | b, c, h, w = batch_shape 61 | dtype = tensor_list[0].dtype 62 | device = tensor_list[0].device 63 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 64 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 65 | for img, pad_img, m in zip(tensor_list, tensor, mask): 66 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 67 | m[: img.shape[1], : img.shape[2]] = False 68 | else: 69 | raise ValueError("not supported") 70 | return NestedTensor(tensor, mask) 71 | 72 | 73 | # _onnx_nested_tensor_from_tensor_list() is an implementation of 74 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
75 | @torch.jit.unused 76 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: 77 | max_size = [] 78 | for i in range(tensor_list[0].dim()): 79 | max_size_i = torch.max( 80 | torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) 81 | ).to(torch.int64) 82 | max_size.append(max_size_i) 83 | max_size = tuple(max_size) 84 | 85 | # work around for 86 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 87 | # m[: img.shape[1], :img.shape[2]] = False 88 | # which is not yet supported in onnx 89 | padded_imgs = [] 90 | padded_masks = [] 91 | for img in tensor_list: 92 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 93 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 94 | padded_imgs.append(padded_img) 95 | 96 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) 97 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) 98 | padded_masks.append(padded_mask.to(torch.bool)) 99 | 100 | tensor = torch.stack(padded_imgs) 101 | mask = torch.stack(padded_masks) 102 | 103 | return NestedTensor(tensor, mask=mask) 104 | 105 | 106 | def is_dist_avail_and_initialized(): 107 | if not dist.is_available(): 108 | return False 109 | if not dist.is_initialized(): 110 | return False 111 | return True 112 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | This directory contains few tools for MaskFormer. 2 | 3 | * `convert-torchvision-to-d2.py` 4 | 5 | Tool to convert torchvision pre-trained weights for D2. 6 | 7 | ``` 8 | wget https://download.pytorch.org/models/resnet101-63fe2227.pth 9 | python tools/convert-torchvision-to-d2.py resnet101-63fe2227.pth R-101.pkl 10 | ``` 11 | 12 | * `convert-pretrained-swin-model-to-d2.py` 13 | 14 | Tool to convert Swin Transformer pre-trained weights for D2. 15 | 16 | ``` 17 | pip install timm 18 | 19 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 20 | python tools/convert-pretrained-swin-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 21 | 22 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth 23 | python tools/convert-pretrained-swin-model-to-d2.py swin_small_patch4_window7_224.pth swin_small_patch4_window7_224.pkl 24 | 25 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth 26 | python tools/convert-pretrained-swin-model-to-d2.py swin_base_patch4_window12_384_22k.pth swin_base_patch4_window12_384_22k.pkl 27 | 28 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth 29 | python tools/convert-pretrained-swin-model-to-d2.py swin_large_patch4_window12_384_22k.pth swin_large_patch4_window12_384_22k.pkl 30 | ``` 31 | 32 | * `evaluate_pq_for_semantic_segmentation.py` 33 | 34 | Tool to evaluate PQ (PQ-stuff) for semantic segmentation predictions. 35 | 36 | Usage: 37 | 38 | ``` 39 | python tools/evaluate_pq_for_semantic_segmentation.py --dataset-name ade20k_sem_seg_val --json-file OUTPUT_DIR/inference/sem_seg_predictions.json 40 | ``` 41 | 42 | where `OUTPUT_DIR` is set in the config file. 43 | 44 | * `evaluate_coco_boundary_ap.py` 45 | 46 | Tool to evaluate Boundary AP for instance segmentation predictions. 
47 | 48 | Usage: 49 | 50 | ``` 51 | python tools/coco_instance_evaluation.py --gt-json-file COCO_GT_JSON --dt-json-file COCO_DT_JSON 52 | ``` 53 | 54 | To install Boundary IoU API, run: 55 | 56 | ``` 57 | pip install git+https://github.com/bowenc0221/boundary-iou-api.git 58 | ``` 59 | 60 | * `analyze_model.py` 61 | 62 | Tool to analyze model parameters and flops. 63 | 64 | Usage for semantic segmentation (ADE20K only, use with caution!): 65 | 66 | ``` 67 | python tools/analyze_model.py --num-inputs 1 --tasks flop --use-fixed-input-size --config-file CONFIG_FILE 68 | ``` 69 | 70 | Note that, for semantic segmentation (ADE20K only), we use a dummy image with fixed size that equals to `cfg.INPUT.CROP.SIZE[0] x cfg.INPUT.CROP.SIZE[0]`. 71 | Please do not use `--use-fixed-input-size` for calculating FLOPs on other datasets like Cityscapes! 72 | 73 | Usage for panoptic and instance segmentation: 74 | 75 | ``` 76 | python tools/analyze_model.py --num-inputs 100 --tasks flop --config-file CONFIG_FILE 77 | ``` 78 | 79 | Note that, for panoptic and instance segmentation, we compute the average flops over 100 real validation images. 80 | -------------------------------------------------------------------------------- /tools/convert-pretrained-swin-model-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download pretrained swin model: 12 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 13 | # run the conversion 14 | ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 15 | # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl" 18 | INPUT: 19 | FORMAT: "RGB" 20 | """ 21 | 22 | if __name__ == "__main__": 23 | input = sys.argv[1] 24 | 25 | obj = torch.load(input, map_location="cpu")["model"] 26 | 27 | res = {"model": obj, "__author__": "third_party", "matching_heuristics": True} 28 | 29 | with open(sys.argv[2], "wb") as f: 30 | pkl.dump(res, f) 31 | -------------------------------------------------------------------------------- /tools/convert-torchvision-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download one of the ResNet{18,34,50,101,152} models from torchvision: 12 | wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth 13 | # run the conversion 14 | ./convert-torchvision-to-d2.py r50.pth r50.pkl 15 | # Then, use r50.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/r50.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | RESNETS: 21 | DEPTH: 50 22 | STRIDE_IN_1X1: False 23 | INPUT: 24 | FORMAT: "RGB" 25 | """ 26 | 27 | if __name__ == "__main__": 28 | input = sys.argv[1] 29 | 30 | obj = torch.load(input, map_location="cpu") 31 | 32 | newmodel = {} 33 | for k in list(obj.keys()): 34 | old_k = k 35 | if "layer" not in k: 36 | k = "stem." 
+ k 37 | for t in [1, 2, 3, 4]: 38 | k = k.replace("layer{}".format(t), "res{}".format(t + 1)) 39 | for t in [1, 2, 3]: 40 | k = k.replace("bn{}".format(t), "conv{}.norm".format(t)) 41 | k = k.replace("downsample.0", "shortcut") 42 | k = k.replace("downsample.1", "shortcut.norm") 43 | print(old_k, "->", k) 44 | newmodel[k] = obj.pop(old_k).detach().numpy() 45 | 46 | res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True} 47 | 48 | with open(sys.argv[2], "wb") as f: 49 | pkl.dump(res, f) 50 | if obj: 51 | print("Unconverted keys:", obj.keys()) 52 | -------------------------------------------------------------------------------- /tools/evaluate_coco_boundary_ap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | # Modified by Bowen Cheng from: https://github.com/bowenc0221/boundary-iou-api/blob/master/tools/coco_instance_evaluation.py 4 | 5 | """ 6 | Evaluation for COCO val2017: 7 | python ./tools/coco_instance_evaluation.py \ 8 | --gt-json-file COCO_GT_JSON \ 9 | --dt-json-file COCO_DT_JSON 10 | """ 11 | import argparse 12 | import json 13 | 14 | from boundary_iou.coco_instance_api.coco import COCO 15 | from boundary_iou.coco_instance_api.cocoeval import COCOeval 16 | 17 | 18 | def main(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--gt-json-file", default="") 21 | parser.add_argument("--dt-json-file", default="") 22 | parser.add_argument("--iou-type", default="boundary") 23 | parser.add_argument("--dilation-ratio", default="0.020", type=float) 24 | args = parser.parse_args() 25 | print(args) 26 | 27 | annFile = args.gt_json_file 28 | resFile = args.dt_json_file 29 | dilation_ratio = args.dilation_ratio 30 | if args.iou_type == "boundary": 31 | get_boundary = True 32 | else: 33 | get_boundary = False 34 | cocoGt = COCO(annFile, get_boundary=get_boundary, dilation_ratio=dilation_ratio) 35 | 36 | # remove box predictions 37 | resFile = json.load(open(resFile)) 38 | for c in resFile: 39 | c.pop("bbox", None) 40 | 41 | cocoDt = cocoGt.loadRes(resFile) 42 | cocoEval = COCOeval(cocoGt, cocoDt, iouType=args.iou_type, dilation_ratio=dilation_ratio) 43 | cocoEval.evaluate() 44 | cocoEval.accumulate() 45 | cocoEval.summarize() 46 | 47 | 48 | if __name__ == '__main__': 49 | main() 50 | -------------------------------------------------------------------------------- /utils/panoptic_parts/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include panoptic_parts/specs/dataset_specs/*.yaml 2 | include panoptic_parts/specs/eval_specs/*.yaml -------------------------------------------------------------------------------- /utils/panoptic_parts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tue-mps/tapps/54d8d9f039fa91da6e046bfd8c62c13a9dc5e8b0/utils/panoptic_parts/__init__.py -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 
6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/requirements.txt: -------------------------------------------------------------------------------- 1 | Sphinx==4.5.0 2 | sphinx-rtd-theme==0.5.1 3 | sphinx-autodoc-typehints==1.11.1 4 | recommonmark==0.7.1 5 | sphinx-markdown-tables==0.0.15 -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/source/api_and_code.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | We provide a public, stable API consisting of tested modules. However, in members of the API you may encounter experimental features (e.g. arguments or functions). These have the prefix `experimental\_` and are exempted from stability guarantees. 5 | 6 | The functions of the API are exported (apart from their original modules) also in the panoptic_parts namespace, so they can be imported and used as: 7 | 8 | .. code-block:: python 9 | 10 | import panoptic_parts as pp 11 | pp.decode_uids(uids) 12 | 13 | 14 | 15 | Label format handling 16 | --------------------- 17 | 18 | .. autofunction:: panoptic_parts.utils.format.decode_uids 19 | .. autofunction:: panoptic_parts.utils.format.encode_ids 20 | 21 | Visualization 22 | ------------- 23 | 24 | .. autofunction:: panoptic_parts.utils.visualization.random_colors 25 | .. autofunction:: panoptic_parts.utils.visualization.uid2color 26 | 27 | Misc 28 | ---- 29 | 30 | .. autofunction:: panoptic_parts.utils.utils.safe_write 31 | 32 | 33 | Code Reference 34 | ============== 35 | 36 | Documented/Undocumented functionality of the rest of the code his repo lies here. This functionality will be added to the API in the future. Until then, the following functions may be moved or be unstable. 
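For quick reference, the members documented in this section can be imported from the module paths shown in the directives below. A minimal, illustrative sketch (these members are outside the stable API, so import paths and signatures may still change):

.. code-block:: python

    # module paths as listed in the autoclass/autofunction directives of this section
    from panoptic_parts.specs.dataset_spec import DatasetSpec
    from panoptic_parts.specs.eval_spec import PartPQEvalSpec, SegmentationPartsEvalSpec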
37 | 38 | Dataset & Evaluation specifications 39 | ----------------------------------- 40 | 41 | .. autoclass:: panoptic_parts.specs.dataset_spec.DatasetSpec 42 | :members: 43 | :undoc-members: 44 | 45 | .. autoclass:: panoptic_parts.specs.eval_spec.PartPQEvalSpec 46 | :members: 47 | :undoc-members: 48 | 49 | .. autoclass:: panoptic_parts.specs.eval_spec.SegmentationPartsEvalSpec 50 | :members: 51 | :undoc-members: 52 | 53 | Visualization 54 | ------------- 55 | 56 | .. autofunction:: panoptic_parts.visualization.visualize_label_with_legend.visualize_from_paths 57 | .. autofunction:: panoptic_parts.utils.visualization.experimental_colorize_label 58 | .. autofunction:: panoptic_parts.utils.visualization._generate_shades 59 | .. autofunction:: panoptic_parts.utils.visualization._num_instances_per_sid 60 | .. autofunction:: panoptic_parts.utils.visualization._num_parts_per_sid 61 | .. autofunction:: panoptic_parts.utils.visualization._sid2iids 62 | .. autofunction:: panoptic_parts.utils.visualization._sid2pids 63 | 64 | Evaluation 65 | ---------- 66 | 67 | .. autofunction:: panoptic_parts.utils.evaluation_PartPQ.evaluate_PartPQ_multicore 68 | .. autoclass:: panoptic_parts.utils.experimental_evaluation_IOU.ConfusionMatrixEvaluator_v2 69 | :members: 70 | :undoc-members: 71 | :show-inheritance: 72 | 73 | Misc 74 | ---- 75 | 76 | .. autofunction:: panoptic_parts.utils.utils.compare_pixelwise 77 | .. autofunction:: panoptic_parts.utils.utils._sparse_ids_mapping_to_dense_ids_mapping -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | from recommonmark.transform import AutoStructify 16 | from recommonmark.parser import CommonMarkParser 17 | sys.path.insert(0, os.path.abspath('../..')) 18 | import panoptic_parts 19 | 20 | 21 | # -- Project information ----------------------------------------------------- 22 | 23 | project = 'Part-aware Panoptic Segmentation' 24 | copyright = '2021, The Panoptic Parts datasets team' 25 | author = 'Panagiotis Meletis and Xiaoxiao (Vincent) Wen' 26 | # version_file = '../../panoptic_parts/version.py' 27 | 28 | 29 | # def get_version(): 30 | # with open(version_file, 'r') as f: 31 | # exec(compile(f.read(), version_file, 'exec')) 32 | # return locals()['__version__'] 33 | 34 | 35 | # # The full version, including alpha/beta/rc tags 36 | # release = get_version() 37 | 38 | release = panoptic_parts.__version__ 39 | 40 | # -- General configuration --------------------------------------------------- 41 | 42 | # Add any Sphinx extension module names here, as strings. They can be 43 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 44 | # ones. Napoleon should be loaded before sphinx_autodoc_typehints. 
45 | extensions = [ 46 | 'sphinx.ext.autodoc', 47 | 'sphinx.ext.napoleon', 48 | 'sphinx.ext.viewcode', 49 | 'sphinx_autodoc_typehints', 50 | 'recommonmark', 51 | 'sphinx_markdown_tables', 52 | ] 53 | 54 | # Disable module names in auto documentation 55 | # add_module_names = False 56 | 57 | # sphinx.ext.autodoc options 58 | # set_type_checking_flag = False # defaults to False 59 | typehints_fully_qualified = True # defaults to False 60 | autodoc_preserve_defaults = True 61 | 62 | # Add any paths that contain templates here, relative to this directory. 63 | # templates_path = ['_templates'] 64 | 65 | # The suffix(es) of source filenames. 66 | # You can specify multiple suffix as a list of string: 67 | source_suffix = { 68 | '.rst': 'restructuredtext', 69 | '.md': 'markdown', 70 | } 71 | 72 | 73 | source_parsers = { 74 | '.md': CommonMarkParser, 75 | } 76 | 77 | # The master toctree document. 78 | master_doc = 'index' 79 | 80 | # List of patterns, relative to source directory, that match files and 81 | # directories to ignore when looking for source files. 82 | # This pattern also affects html_static_path and html_extra_path. 83 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 84 | 85 | 86 | # -- Options for HTML output ------------------------------------------------- 87 | 88 | # The theme to use for HTML and HTML Help pages. See the documentation for 89 | # a list of builtin themes. 90 | # 91 | html_theme = 'sphinx_rtd_theme' 92 | 93 | # Add any paths that contain custom static files (such as style sheets) here, 94 | # relative to this directory. They are copied after the builtin static files, 95 | # so a file named "default.css" will overwrite the builtin "default.css". 96 | html_static_path = ['_static'] 97 | 98 | github_doc_root = 'https://github.com/rtfd/recommonmark/tree/master/doc/' 99 | def setup(app): 100 | app.add_config_value('recommonmark_config', { 101 | 'url_resolver': lambda url: github_doc_root + url, 102 | 'auto_toc_tree_section': 'Contents', 103 | }, True) 104 | app.add_transform(AutoStructify) -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/source/contact.md: -------------------------------------------------------------------------------- 1 | ## Contact 2 | 3 | Please feel free to contact us for any suggestions or questions: 4 | 5 | * Panagiotis Meletis: **p**[DOT]**c**[DOT]**meletis**[AT]**tue.nl** 6 | * Xiaoxiao (Vincent) Wen: **wenxx10**[AT]**gmail.com** -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/source/errata_cvpr2021.md: -------------------------------------------------------------------------------- 1 | # CVPR 2021 paper errata 2 | 3 | Here is list of the tables from the paper Part-aware Panoptic Segmentation and the corrected PartPQ results. 4 | -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/source/ground_truth_usage_cases.md: -------------------------------------------------------------------------------- 1 | ## Ground Truth usage cases 2 | 3 | 4 | 5 | 6 | ```eval_rst 7 | We provide for each image a single (image-like) ground truth file encoding semantic-, instance-, and parts- levels annotations. 
Our compact :doc:`Label format ` together with 8 | :func:`panoptic_parts.utils.format.decode_uids` 9 | function enables easy decoding of the labels for various image understanding tasks, including: 10 | ``` 11 | 12 | ```Python 13 | from panoptic_parts.utils.format import decode_uids 14 | # labels: Python int, or np.ndarray, or tf.Tensor, or torch.tensor 15 | # Semantic Segmentation 16 | semantic_ids, _, _ = decode_uids(labels) 17 | 18 | # Instance Segmentation 19 | semantic_ids, instance_ids, _ = decode_uids(labels) 20 | 21 | # Panoptic Segmentation 22 | _, _, _, semantic_instance_ids = decode_uids(labels, return_sids_iids=True) 23 | 24 | # Parts Segmentation / Parts Parsing 25 | _, _, _, semantic_parts_ids = decode_uids(labels, return_sids_pids=True) 26 | 27 | # Instance-level Parts Parsing 28 | semantic_ids, instance_ids, parts_ids = decode_uids(labels) 29 | 30 | # Parts-level Panoptic Segmentation 31 | _, _, _, semantic_instance_ids, semantic_parts_ids = decode_uids(labels, return_sids_iids=True, return_sids_pids=True) 32 | 33 | ``` -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. Part-aware Panoptic Segmentation documentation master file, created by 2 | sphinx-quickstart on Thu Jan 28 11:43:38 2021. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Part-aware Panoptic Segmentation documentation! 7 | =========================================================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | :caption: Get Started 12 | 13 | introduction.md 14 | installation.md 15 | label_format.md 16 | 17 | .. toctree:: 18 | :caption: API & Code Reference 19 | 20 | api_and_code.rst 21 | 22 | 23 | .. toctree:: 24 | :caption: Evaluation 25 | :maxdepth: 1 26 | 27 | evaluate_results.md 28 | 29 | .. toctree:: 30 | :maxdepth: 1 31 | :caption: Examples and Tools 32 | 33 | visualization.md 34 | generate_results.md 35 | ground_truth_usage_cases.md 36 | tools.md 37 | scripts.md 38 | 39 | .. toctree:: 40 | :caption: Contact 41 | 42 | ======= 43 | Contact 44 | ======= 45 | Please feel free to contact us for any suggestions or questions. 46 | 47 | **panoptic.parts@outlook.com** 48 | 49 | Correspondence: Panagiotis Meletis, Vincent (Xiaoxiao) Wen 50 | 51 | The Panoptic Parts datasets team 52 | 53 | 54 | Indices and tables 55 | ================== 56 | 57 | * :ref:`genindex` 58 | * :ref:`search` 59 | -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/source/installation.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | The code can be installed from PyPI and requires at least Python 3.7. It is recommended to install it in a Python virtual environment. 4 | 5 | ```shell 6 | pip install panoptic_parts 7 | ``` 8 | 9 | Some functionality requires extra packages to be installed, e.g. evaluation scripts (tqdm) or PyTorch/TensorFlow (torch/tensorflow).
These can be installed separately or by downloading the `optional.txt` file from this repo and running the following command in the virtual environment: 10 | 11 | ```shell 12 | pip install -r optional.txt 13 | ``` 14 | 15 | After installation you can use the package as: 16 | 17 | ```python 18 | import panoptic_parts as pp 19 | 20 | print(pp.VERSION) 21 | ``` 22 | 23 | There are three scripts defined as entry points by the package: 24 | 25 | ```shell 26 | pp_merge_to_panoptic 27 | pp_merge_to_pps 28 | pp_visualize_label_with_legend 29 | ``` 30 | -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/source/introduction.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | This repository contains code and tools for reading, processing, evaluating on, and visualizing Panoptic Parts datasets. Moreover, it contains code for reproducing our CVPR 2021 paper results. 4 | 5 | ## Datasets 6 | 7 | *Cityscapes-Panoptic-Parts* and *PASCAL-Panoptic-Parts* are created by extending two established datasets for image scene understanding, namely [Cityscapes](https://github.com/mcordts/cityscapesScripts "Cityscapes") and [PASCAL](http://host.robots.ox.ac.uk/pascal/VOC/voc2010/ "PASCAL") datasets. Detailed description of the datasets and various statistics are presented in our technical report in [arxiv](https://arxiv.org/abs/2004.07944 "arxiv.org"). The datasets can be downloaded from: 8 | 9 | - [Cityscapes Panoptic Parts](https://www.cityscapes-dataset.com/login/) 10 | - [PASCAL Panoptic Parts](https://1drv.ms/u/s!AojlpuGgPtL1bHXfIdeL14IeVhI?e=5tNfET) ([alternative link](https://pan.baidu.com/s/1k96Wdg_IyD91kvq87Wy7nw), code: i7ap) 11 | 12 | ## API and code reference 13 | 14 | We provide a public, stable API, and various code utilities that are documented [here](https://panoptic-parts.readthedocs.io/en/stable/api_and_code.html). 15 | 16 | ## Reproducing CVPR 2021 paper 17 | 18 | The part-aware panoptic segmentation results from the paper can be reproduced using [this](https://panoptic-parts.readthedocs.io/en/stable/generate_results.html) guide. 19 | 20 | ## Evaluation metrics 21 | 22 | We provide two metrics for evaluating performance on Panoptic Parts datasets. 23 | 24 | - Part-aware Panoptic Quality (PartPQ): [here](https://panoptic-parts.readthedocs.io/en/stable/evaluate_results.html). 25 | - Intersection over Union (IoU): _TBA_ 26 | 27 | ## Citations 28 | 29 | Please cite us if you find our work useful or you use it in your research: 30 | 31 | ```bibtex 32 | @inproceedings{degeus2021panopticparts, 33 | title = {Part-aware Panoptic Segmentation}, 34 | author = {Daan de Geus and Panagiotis Meletis and Chenyang Lu and Xiaoxiao Wen and Gijs Dubbelman}, 35 | booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, 36 | year = {2021} 37 | } 38 | ``` 39 | 40 | ```bibtex 41 | @article{meletis2020panopticparts, 42 | title = {Cityscapes-Panoptic-Parts and PASCAL-Panoptic-Parts datasets for Scene Understanding}, 43 | author = {Panagiotis Meletis and Xiaoxiao Wen and Chenyang Lu and Daan de Geus and Gijs Dubbelman}, 44 | type = {Technical report}, 45 | institution = {Eindhoven University of Technology}, 46 | date = {16/04/2020}, 47 | url = {https://github.com/tue-mps/panoptic_parts}, 48 | eprint={2004.07944}, 49 | archivePrefix={arXiv}, 50 | primaryClass={cs.CV} 51 | } 52 | ``` 53 | 54 | 55 | 56 | ```eval_rst 57 | .. 
image:: _static/mps_logo.png 58 | :target: https://www.tue.nl/en/research/research-groups/signal-processing-systems/mobile-perception-systems-lab/ 59 | :alt: MPS 60 | :height: 100 61 | 62 | .. image:: _static/tue_logo.jpg 63 | :target: https://www.tue.nl/ 64 | :alt: TU/e 65 | :height: 100 66 | ``` 67 | -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/source/label_format.md: -------------------------------------------------------------------------------- 1 | # Serialization format: hierarchical information encoding 2 | 3 | The goal of the format is to include (per image) all annotations in a single, image-like label file with a consistent encoding across all abstractions and information levels. This enables easy transfer, reading, and compact handling of the annotations. The following hierarchical structure is chosen, which extends the Cityscapes serialization format. 4 | 5 | 6 | 7 | 8 | ```eval_rst 9 | .. image:: _static/hierarchical_format.jpg 10 | :target: _static/hierarchical_format.jpg 11 | :alt: Hierarchical Label Format 12 | ``` 13 | 14 | We encode three levels of labels (semantic, instance, and parts) in a single image-like file. Labels for both datasets follow this format. 15 | Each pixel in our hierarchical label format has an up to 7-digit _universal id_ (_uid_) containing: 16 | 17 | - An up to 2-digit _semantic id_ (_sid_), encoding the semantic-level _things_ or _stuff_ class. 18 | - An up to 3-digit _instance id_ (_iid_), a counter of instances per _things_ class and per image. This is optional. 19 | - An up to 2-digit _part id_ (_pid_), encoding the parts-level semantic class per-instance and per-image. This is optional, but if provided requires also an _iid_. Only _things_ parts are covered by this format. 20 | 21 | We compactly encode the aforementioned _ids_ (_sid_, _iid_, _pid_) into an up to 7-digit _uid_. Starting from the left, the first one or two digits encode the semantic class, the next three encode the instance (after zero pre-padding), and the final two encode the parts class (after zero pre-padding). 22 | 23 | Using the above encoding: 24 | 25 | - 1-2 digit _uids_ encode only semantic-level labels 26 | - 4-5 digit _uids_ encode semantic-instance-level labels 27 | - 6-7 digit _uids_ encode semantic-instance-parts-level labels 28 | 29 | For example, in _Cityscapes-Panoptic-Parts_, a _sky_ (_stuff_) pixel will have _uid_ = 23; a _car_ (_things_) pixel that is labeled only on the semantic level will have _uid_ = 26, or _uid_ = 26002 if it is also labeled on the instance level; and a _person_ (_things_) pixel that is labeled on all three levels can have _uid_ = 2401002. 30 | 31 | > The format covers parts-level classes for _stuff_ semantic classes using a dummy instance id (`iid = 0`). Cityscapes Panoptic Parts and PASCAL Panoptic Parts do not currently define any _stuff_ with part-level classes. This is a feature that can be used in future extensions.
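To make the digit arithmetic above concrete, the following is a minimal sketch of how a single _uid_ splits into (_sid_, _iid_, _pid_). The helper name `split_uid` is illustrative only; in practice use `panoptic_parts.utils.format.decode_uids`, which also accepts NumPy/TensorFlow/PyTorch tensors and provides the optional `return_sids_iids` / `return_sids_pids` outputs.

```python
def split_uid(uid: int):
    """Split an up-to-7-digit uid into (sid, iid, pid); iid/pid are None when not encoded."""
    if uid <= 99:                 # 1-2 digits: semantic level only, e.g. 23 (sky)
        return uid, None, None
    if uid <= 99_999:             # 4-5 digits: semantic + instance, e.g. 26002 (car, instance 2)
        return uid // 1_000, uid % 1_000, None
    # 6-7 digits: semantic + instance + parts, e.g. 2401002 (person, instance 10, part 2)
    return uid // 100_000, (uid // 100) % 1_000, uid % 100


print(split_uid(2401002))  # (24, 10, 2)
```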
32 | 33 | ## Unlabeled/Ignored pixels 34 | 35 | We handle the unlabeled / void / ignored / "do not care pixels" in the three levels as follows: 36 | 37 | - Semantic level: For _Cityscapes-Panoptic-Parts_ we use the original Cityscapes void class. For _PASCAL-Panoptic-Parts_ we use the class with _uid_ = 0. 38 | - Instance level: For instances the void class is not required. If a pixel does not belong to an object or cannot be labeled on instance level then it has only an up to 2-digit _semantic id_. 39 | - Parts level: For both datasets we use the convention that, for each semantic class, the part-level class with _pid_ = 0 represents the void pixels, e.g., for a _person_ pixel, _uid_ = 2401000 represents the void parts pixels of instance 10. The need for a void class arises during the manual annotation process but in principle it is not needed at the parts level. Thus, we try to minimize void parts level pixels and assign them instead only the semantic- or semantic-instance -level labels. 40 | -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/source/scripts.md: -------------------------------------------------------------------------------- 1 | ## Scripts -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/source/tools.md: -------------------------------------------------------------------------------- 1 | ## Tools -------------------------------------------------------------------------------- /utils/panoptic_parts/docs/source/visualization.md: -------------------------------------------------------------------------------- 1 | ## Visualization of ground truth 2 | 3 | ### Cityscapes-Panoptic-Parts 4 | 5 | ```eval_rst 6 | .. list-table:: 7 | :header-rows: 1 8 | 9 | * - 10 | .. image:: _static/aachen_000012_000019_leftImg8bit.jpg 11 | :target: _static/aachen_000012_000019_leftImg8bit.jpg 12 | :alt: aachen_000012_000019_leftImg8bit 13 | 14 | - 15 | .. image:: _static/aachen_000012_000019_uids_pids_colored.png 16 | :target: _static/aachen_000012_000019_uids_pids_colored.png 17 | :alt: aachen_000012_000019_uids_pids_colored 18 | 19 | * - 20 | .. image:: _static/frankfurt_000001_011835_leftImg8bit.jpg 21 | :target: _static/frankfurt_000001_011835_leftImg8bit.jpg 22 | :alt: frankfurt_000001_011835_leftImg8bit 23 | 24 | - 25 | .. image:: _static/frankfurt_000001_011835_uids_pids_colored.png 26 | :target: _static/frankfurt_000001_011835_uids_pids_colored.png 27 | :alt: frankfurt_000001_011835_uids_pids_colored 28 | ``` 29 | 30 | ### PASCAL-Panoptic-Parts 31 | 32 | ```eval_rst 33 | .. list-table:: 34 | :header-rows: 1 35 | 36 | * - 37 | .. image:: _static/2008_000393.jpg 38 | :target: _static/2008_000393.jpg 39 | :alt: 2008_000393 40 | 41 | - 42 | .. image:: _static/2008_000393_colored.png 43 | :target: _static/2008_000393_colored.png 44 | :alt: 2008_000393_colored 45 | 46 | - 47 | .. image:: _static/2008_000716.jpg 48 | :target: _static/2008_000716.jpg 49 | :alt: 2008_000716 50 | 51 | - 52 | .. image:: _static/2008_000716_colored.png 53 | :target: _static/2008_000716_colored.png 54 | :alt: 2008_000716_colored 55 | 56 | * - 57 | .. image:: _static/2008_007456.jpg 58 | :target: _static/2008_007456.jpg 59 | :alt: 2008_007456 60 | 61 | - 62 | .. image:: _static/2008_007456_colored_repainted.png 63 | :target: _static/2008_007456_colored_repainted.png 64 | :alt: 2008_007456_colored_repainted 65 | 66 | - 67 | .. 
image:: _static/2010_002356.jpg 68 | :target: _static/2010_002356.jpg 69 | :alt: 2010_002356 70 | 71 | - 72 | .. image:: _static/2010_002356_colored.png 73 | :target: _static/2010_002356_colored.png 74 | :alt: 2010_002356_colored 75 | ``` 76 | -------------------------------------------------------------------------------- /utils/panoptic_parts/optional.txt: -------------------------------------------------------------------------------- 1 | tensorflow>=2.4.0 2 | torch>=1.7.0 3 | git+https://github.com/cocodataset/panopticapi.git 4 | tqdm 5 | pycocotools>=2.0.0 -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/__init__.py: -------------------------------------------------------------------------------- 1 | from panoptic_parts.utils.format import decode_uids, encode_ids 2 | from panoptic_parts.utils.visualization import uid2color, random_colors 3 | from panoptic_parts.utils.utils import safe_write 4 | 5 | 6 | __version__ = '2.0rc5' 7 | VERSION = __version__ 8 | -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/cityscapes_panoptic_parts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tue-mps/tapps/54d8d9f039fa91da6e046bfd8c62c13a9dc5e8b0/utils/panoptic_parts/panoptic_parts/cityscapes_panoptic_parts/__init__.py -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/cityscapes_panoptic_parts/dataset_v2.0/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Cityscapes Panoptic Parts annotations 3 | We have manually annotated 5 scene-level classes with 23 part-level classes from Cityscapes vehicle and human categories. 4 | 5 | You can download the dataset from the [Cityscapes Dataset](https://www.cityscapes-dataset.com/login/) website. 6 | 7 | Pixels of humans and vehicles (_sids_: 24, 25, 26, 27, or 28) that are not assigned to any part-level class by the annotation team or it is not clearly visible to which part they belong to, have _pid_ = 0 or they maintain their semantic-level or semantic-instance-level labels. From the perspective of semantics the labels `SS_III_00` and `SS_III` are equivalent. 8 | 9 | ## Human (person (_sid_: 24), rider (_sid_: 25)) pids: 10 | 11 | * 0: unlabeled / void 12 | * 1: torso 13 | * 2: head 14 | * 3: arms 15 | * 4: legs 16 | 17 | > Note: For human and rider scene classes a _pid_ 5 exists in a minority of ground truth files (~10). This _pid_ is an artefact of data preprocessing. 
These artefact can be automatically set to void _pid_ 0 (unlabeled part) using the decoding functionality provided in the following snippet: 18 | 19 | ```python 20 | uids = np.array(Image.open('gt_filepath.tif'), dtype=np.int32) 21 | dataset_spec = DatasetSpec('cpp_datasetspec.yaml') 22 | _, _, pids = decode_uids(uids, experimental_dataset_spec=dataset_spec, experimental_correct_range=True) 23 | ``` 24 | 25 | ## Vehicle (car (_sid_: 26), truck (_sid_: 27), bus (_sid_: 28)) pids: 26 | 27 | * 0: unlabeled / void 28 | * 1: windows 29 | * 2: wheels 30 | * 3: lights 31 | * 4: license plate 32 | * 5: chassis 33 | 34 | ## Contact 35 | 36 | Please feel free to contact us for any suggestions or questions: 37 | 38 | * Panagiotis Meletis: **p**[DOT]**c**[DOT]**meletis**[AT]**tue.nl** 39 | * Xiaoxiao (Vincent) Wen: **wenxx10**[AT]**gmail.com** 40 | -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tue-mps/tapps/54d8d9f039fa91da6e046bfd8c62c13a9dc5e8b0/utils/panoptic_parts/panoptic_parts/evaluation/__init__.py -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/evaluation/prepare_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import glob 3 | import os 4 | import argparse 5 | 6 | from tqdm import tqdm 7 | from PIL import Image 8 | 9 | 10 | def create_image_list(dataset_dir, output_dir, dataset): 11 | """ 12 | :param dataset_dir: path to the PPS ground-truths file for the data split 13 | :param output_dir: directory where the images.json file will be stored 14 | :param dataset: dataset name ('Cityscapes' or 'Pascal') 15 | 16 | :return: 17 | """ 18 | print("Creating images list...") 19 | images_list = list() 20 | 21 | # Get all filenames in the GT directory 22 | filenames = [file for file in glob.glob(dataset_dir + "/*")] 23 | if dataset == 'Cityscapes': 24 | filenames.extend([file for file in glob.glob(dataset_dir + "/*/*")]) 25 | 26 | for filename in tqdm(filenames): 27 | if filename.endswith(str('.tif')): 28 | image_dict = dict() 29 | file_name_gt = os.path.basename(filename) 30 | 31 | # Set names for file_name and image_id 32 | if dataset == 'Cityscapes': 33 | file_name = file_name_gt.replace('_gtFinePanopticParts.tif', '_gtFine_leftImg8bit.png') 34 | image_id = file_name_gt.replace('_gtFinePanopticParts.tif', '') 35 | else: 36 | file_name = file_name_gt.replace('.tif', '.png') 37 | image_id = file_name_gt.replace('.tif', '') 38 | image_dict['file_name'] = file_name 39 | image_dict['id'] = image_id 40 | 41 | # Open gt image and store image dimensions 42 | img = Image.open(filename) 43 | image_dict['width'], image_dict['height'] = img.size[0:2] 44 | 45 | images_list.append(image_dict) 46 | 47 | images_dict = {'images': images_list} 48 | 49 | # Save images.json file 50 | output_path = os.path.join(output_dir, 'images.json') 51 | with open(output_path, 'w') as fp: 52 | json.dump(images_dict, fp) 53 | 54 | print("Created images list and stored at {}.".format(output_path)) 55 | 56 | if __name__ == '__main__': 57 | parser = argparse.ArgumentParser( 58 | description="Creates an images.json file for the Cityscapes Panoptic Parts or Pascal Panoptic Parts dataset." 
59 | ) 60 | 61 | parser.add_argument('dataset_dir', type=str, 62 | help="path to the PPS ground-truths file for the data split") 63 | parser.add_argument('output_dir', type=str, 64 | help="directory where the images.json file will be stored") 65 | parser.add_argument('dataset', type=str, 66 | help="dataset name ('Cityscapes' or 'Pascal')") 67 | args = parser.parse_args() 68 | 69 | create_image_list(args.dataset_dir, 70 | args.output_dir, 71 | dataset=args.dataset) 72 | -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/merging/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tue-mps/tapps/54d8d9f039fa91da6e046bfd8c62c13a9dc5e8b0/utils/panoptic_parts/panoptic_parts/merging/__init__.py -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/pascal_panoptic_parts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tue-mps/tapps/54d8d9f039fa91da6e046bfd8c62c13a9dc5e8b0/utils/panoptic_parts/panoptic_parts/pascal_panoptic_parts/__init__.py -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/pascal_panoptic_parts/dataset_v2.0/README.md: -------------------------------------------------------------------------------- 1 | # Contact 2 | 3 | Please feel free to contact us for any suggestions or questions: 4 | 5 | * Panagiotis Meletis: **p**[DOT]**c**[DOT]**meletis**[AT]**tue.nl** 6 | * Xiaoxiao (Vincent) Wen: **wenxx10**[AT]**gmail.com** 7 | -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/specs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tue-mps/tapps/54d8d9f039fa91da6e046bfd8c62c13a9dc5e8b0/utils/panoptic_parts/panoptic_parts/specs/__init__.py -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/specs/dataset_specs/cpp_datasetspec.yaml: -------------------------------------------------------------------------------- 1 | version: '2.0' 2 | comments: 3 | - refer to ppp_datasetspec for now 4 | name: Cityscapes Panoptic Parts 5 | 6 | # scene_class2part_classes: An ordered mapping from scene-level class to part-level classes. 7 | # Refer to ppp_datasetspec for now. 8 | scene_class2part_classes: { 9 | ego vehicle: [], 10 | rectification border: [], 11 | out of roi: [], 12 | static: [], 13 | dynamic: [], 14 | ground: [], 15 | road: [], 16 | sidewalk: [], 17 | parking: [], 18 | rail track: [], 19 | building: [], 20 | wall: [], 21 | fence: [], 22 | guard rail: [], 23 | bridge: [], 24 | tunnel: [], 25 | pole: [], 26 | polegroup: [], 27 | traffic light: [], 28 | traffic sign: [], 29 | vegetation: [], 30 | terrain: [], 31 | sky: [], 32 | person: [torso, head, arm, leg], 33 | rider: [torso, head, arm, leg], 34 | car: [window, wheel, light, license plate, chassis], 35 | truck: [window, wheel, light, license plate, chassis], 36 | bus: [window, wheel, light, license plate, chassis], 37 | caravan: [], 38 | trailer: [], 39 | train: [], 40 | motorcycle: [], 41 | bicycle: [], 42 | license plate: [], 43 | } 44 | 45 | # Refer to ppp_datasetspec for now. 
46 | scene_classes_with_instances: [ 47 | person, rider, car, truck, bus, caravan, trailer, train, motorcycle, bicycle 48 | ] 49 | 50 | # Refer to ppp_datasetspec for now. 51 | scene_class2color: { 52 | ego vehicle: [0, 0, 0], 53 | rectification border: [0, 0, 0], 54 | out of roi: [0, 0, 0], 55 | static: [0, 0, 0], 56 | dynamic: [111, 74, 0], 57 | ground: [81, 0, 81], 58 | road: [128, 64, 128], 59 | sidewalk: [244, 35, 232], 60 | parking: [250, 170, 160], 61 | rail track: [230, 150, 140], 62 | building: [70, 70, 70], 63 | wall: [102, 102, 156], 64 | fence: [190, 153, 153], 65 | guard rail: [180, 165, 180], 66 | bridge: [150, 100, 100], 67 | tunnel: [150, 120, 90], 68 | pole: [153, 153, 153], 69 | polegroup: [153, 153, 153], 70 | traffic light: [250, 170, 30], 71 | traffic sign: [220, 220, 0], 72 | vegetation: [107, 142, 35], 73 | terrain: [152, 251, 152], 74 | sky: [70, 130, 180], 75 | person: [220, 20, 60], 76 | rider: [255, 0, 0], 77 | car: [0, 0, 142], 78 | truck: [0, 0, 70], 79 | bus: [0, 60, 100], 80 | caravan: [0, 0, 90], 81 | trailer: [0, 0, 110], 82 | train: [0, 80, 100], 83 | motorcycle: [0, 0, 230], 84 | bicycle: [119, 11, 32], 85 | license plate: [0, 0, 142], 86 | } 87 | -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/specs/eval_specs/ppq_cpp_19_23_cvpr21_default_evalspec.yaml: -------------------------------------------------------------------------------- 1 | version: 2.0 2 | comments: 3 | First version containing all information I (Daan) think is necessary for merging to panoptic and part-aware panoptic (and it should also be usable for PartPQ evaluation) 4 | 5 | dataset_spec_path: utils/panoptic_parts/panoptic_parts/specs/dataset_specs/cpp_datasetspec.yaml 6 | 7 | # To be used for evaluation 8 | ignore_label: 255 9 | 10 | # To be used for evaluation 11 | dataset_sid2eval_sid: { 12 | # evaluated 13 | 7: 7, 8: 8, 11: 11, 12: 12, 13: 13, 14 | 17: 17, 19: 19, 20: 20, 21: 21, 22: 22, 15 | 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 16 | 28: 28, 31: 31, 32: 32, 33: 33, 17 | # ignored 18 | 0: IGNORED, 1: IGNORED, 2: IGNORED, 3: IGNORED, 4: IGNORED, 5: IGNORED, 6: IGNORED, 9: IGNORED, 10: IGNORED, 14: IGNORED, 15: IGNORED, 16: IGNORED, 18: IGNORED, 29: IGNORED, 30: IGNORED, 19 | # default 20 | DEFAULT: IGNORED 21 | } 22 | 23 | # To be used for evaluation 24 | dataset_sid_pid2eval_sid_pid: { 25 | # evaluated 26 | 24_01: 24_01, 24_02: 24_02, 24_03: 24_03, 24_04: 24_04, 27 | 25_01: 25_01, 25_02: 25_02, 25_03: 25_03, 25_04: 25_04, 28 | 26_01: 26_01, 26_02: 26_02, 26_03: 26_03, 26_04: 26_04, 26_05: 26_05, 29 | 27_01: 27_01, 27_02: 27_02, 27_03: 27_03, 27_04: 27_04, 27_05: 27_05, 30 | 28_01: 28_01, 28_02: 28_02, 28_03: 28_03, 28_04: 28_04, 28_05: 28_05, 31 | # ignored 32 | 24: IGNORED, 25: IGNORED, 26: IGNORED, 27: IGNORED, 28: IGNORED, 33 | 0: IGNORED, 1: IGNORED, 2: IGNORED, 3: IGNORED, 4: IGNORED, 5: IGNORED, 6: IGNORED, 9: IGNORED, 10: IGNORED, 14: IGNORED, 15: IGNORED, 16: IGNORED, 18: IGNORED, 29: IGNORED, 30: IGNORED, 34 | # default 35 | DEFAULT: IGNORED 36 | } 37 | 38 | # Used for merging and evaluation 39 | eval_sid_things: [24, 25, 26, 27, 28, 31, 32, 33] 40 | eval_sid_stuff: [7, 8, 11, 12, 13, 17, 19, 20, 21, 22, 23] 41 | eval_sid_parts: [24, 25, 26, 27, 28] 42 | eval_sid_no_parts: [7, 8, 11, 12, 13, 17, 19, 20, 21, 22, 23, 31, 32, 33] 43 | 44 | # Used for merging panoptic & parts 45 | eval_sid_pid2eval_pid_flat: { 46 | 24_01: 1, 24_02: 2, 24_03: 3, 24_04: 4, 47 | 25_01: 5, 25_02: 6, 25_03: 7, 25_04: 8, 48 | 
26_01: 9, 26_02: 10, 26_03: 11, 26_04: 12, 26_05: 13, 49 | 27_01: 14, 27_02: 15, 27_03: 16, 27_04: 17, 27_05: 18, 50 | 28_01: 19, 28_02: 20, 28_03: 21, 28_04: 22, 28_05: 23, 51 | } 52 | 53 | 54 | # Names for all labels that are to be evaluated 55 | eval_sid2scene_label: { 56 | 7: road, 8: sidewalk, 11: building, 12: wall, 57 | 13: fence, 17: pole, 19: traffic light, 20: traffic sign, 58 | 21: vegetation, 22: terrain, 23: sky, 24: person, 59 | 25: rider, 26: car, 27: truck, 28: bus, 60 | 31: train, 32: motorcycle, 33: bicycle 61 | } 62 | 63 | eval_pid_flat2scene_part_label: { 64 | 1: person-torso, 2: person-head, 3: person-arms, 4: person-legs, 65 | 5: rider-torso, 6: rider-head, 7: rider-arms, 8: rider-legs, 66 | 9: car-windows, 10: car-wheels, 11: car-lights, 12: car-license_plate, 13: car-chassis, 67 | 14: truck-windows, 15: truck-wheels, 16: truck-lights, 17: truck-license_plate, 18: truck-chassis, 68 | 19: bus-windows, 20: bus-wheels, 21: bus-lights, 22: bus-license_plate, 23: bus-chassis, 69 | } 70 | -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/specs/eval_specs/ppq_cpp_19_23_cvpr21_grouped_evalspec.yaml: -------------------------------------------------------------------------------- 1 | version: 1.0 2 | comments: Information required to calculate the PartPQ for CPP 3 | 4 | dataset_spec_path: utils/panoptic_parts/panoptic_parts/specs/dataset_specs/cpp_datasetspec.yaml 5 | 6 | # To be used for evaluation 7 | ignore_label: 255 8 | 9 | # To be used for evaluation 10 | dataset_sid2eval_sid: { 11 | # evaluated 12 | 7: 7, 8: 8, 11: 11, 12: 12, 13: 13, 13 | 17: 17, 19: 19, 20: 20, 21: 21, 22: 22, 14 | 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 15 | 28: 28, 31: 31, 32: 32, 33: 33, 16 | # ignored 17 | 0: IGNORED, 1: IGNORED, 2: IGNORED, 3: IGNORED, 4: IGNORED, 5: IGNORED, 6: IGNORED, 9: IGNORED, 10: IGNORED, 14: IGNORED, 15: IGNORED, 16: IGNORED, 18: IGNORED, 29: IGNORED, 30: IGNORED, 18 | # default 19 | DEFAULT: IGNORED 20 | } 21 | 22 | # To be used for evaluation 23 | dataset_sid_pid2eval_sid_pid: { 24 | # evaluated 25 | 24_01: 24_01, 24_02: 24_02, 24_03: 24_03, 24_04: 24_04, 26 | 25_01: 25_01, 25_02: 25_02, 25_03: 25_03, 25_04: 25_04, 27 | 26_01: 26_01, 26_02: 26_02, 26_03: 26_03, 26_04: 26_04, 26_05: 26_05, 28 | 27_01: 27_01, 27_02: 27_02, 27_03: 27_03, 27_04: 27_04, 27_05: 27_05, 29 | 28_01: 28_01, 28_02: 28_02, 28_03: 28_03, 28_04: 28_04, 28_05: 28_05, 30 | # ignored 31 | 24: IGNORED, 25: IGNORED, 26: IGNORED, 27: IGNORED, 28: IGNORED, 32 | 0: IGNORED, 1: IGNORED, 2: IGNORED, 3: IGNORED, 4: IGNORED, 5: IGNORED, 6: IGNORED, 9: IGNORED, 10: IGNORED, 14: IGNORED, 15: IGNORED, 16: IGNORED, 18: IGNORED, 29: IGNORED, 30: IGNORED, 33 | # default 34 | DEFAULT: IGNORED 35 | } 36 | 37 | # Used for merging and evaluation 38 | eval_sid_things: [24, 25, 26, 27, 28, 31, 32, 33] 39 | eval_sid_stuff: [7, 8, 11, 12, 13, 17, 19, 20, 21, 22, 23] 40 | eval_sid_parts: [24, 25, 26, 27, 28] 41 | eval_sid_no_parts: [7, 8, 11, 12, 13, 17, 19, 20, 21, 22, 23, 31, 32, 33] 42 | 43 | # Used for merging panoptic & parts 44 | eval_sid_pid2eval_pid_flat: { 45 | 24_01: 1, 24_02: 2, 24_03: 3, 24_04: 4, 46 | 25_01: 1, 25_02: 2, 25_03: 3, 25_04: 4, 47 | 26_01: 5, 26_02: 6, 26_03: 7, 26_04: 8, 26_05: 9, 48 | 27_01: 5, 27_02: 6, 27_03: 7, 27_04: 8, 27_05: 9, 49 | 28_01: 5, 28_02: 6, 28_03: 7, 28_04: 8, 28_05: 9, 50 | } 51 | 52 | # Names for all labels that are to be evaluated 53 | eval_sid2scene_label: { 54 | 7: road, 8: sidewalk, 11: building, 12: wall, 
55 | 13: fence, 17: pole, 19: traffic light, 20: traffic sign, 56 | 21: vegetation, 22: terrain, 23: sky, 24: person, 57 | 25: rider, 26: car, 27: truck, 28: bus, 58 | 31: train, 32: motorcycle, 33: bicycle 59 | } 60 | 61 | eval_pid_flat2scene_part_label: { 62 | 1: human-torso, 2: human-head, 3: human-arms, 4: human-legs, 63 | 5: vehicle-windows, 6: vehicle-wheels, 7: vehicle-lights, 8: vehicle-license_plate, 9: vehicle-chassis, 64 | } 65 | -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tue-mps/tapps/54d8d9f039fa91da6e046bfd8c62c13a9dc5e8b0/utils/panoptic_parts/panoptic_parts/utils/__init__.py -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/utils/internal/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tue-mps/tapps/54d8d9f039fa91da6e046bfd8c62c13a9dc5e8b0/utils/panoptic_parts/panoptic_parts/utils/internal/__init__.py -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/utils/internal/convert_annotations_v1_to_v2.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import os.path as op 4 | import sys 5 | from tqdm import tqdm 6 | 7 | import numpy as np 8 | from PIL import Image 9 | 10 | from panoptic_parts.utils.format import decode_uids, encode_ids 11 | 12 | 13 | def convert(): 14 | basepath = 'pascal_panoptic_parts/releases/20201704/pascal_panoptic_parts_v1' 15 | 16 | filepaths = glob.glob(op.join(basepath, 'training/*.tif')) + glob.glob(op.join(basepath, 'validation/*.tif')) 17 | 18 | for fp in tqdm(filepaths): 19 | uids = np.asarray(Image.open(fp), dtype=np.int32) 20 | # transformation 1 (tvmonitor-unlabeled becomes tvmonitor-frame): {20_XXX, 20_XXX_00} -> 20_XXX_02 21 | sids, iids, pids, sids_iids, sids_pids = decode_uids(uids, return_sids_iids=True, return_sids_pids=True) 22 | pids = np.where(np.logical_and(iids >= 0, 23 | np.logical_or(np.equal(sids_pids, 20), np.equal(sids_pids, 20_00))), 24 | 2, 25 | pids) 26 | uids = encode_ids(sids, iids, pids) 27 | # transformation 1 (remove 00): XX_XXX_00 -> XX_XXX 28 | _, _, pids, sids_iids = decode_uids(uids, return_sids_iids=True) 29 | uids = np.where(np.logical_and(uids >= 1_000_00, np.equal(pids, 0)), 30 | sids_iids, 31 | uids) 32 | 33 | path_new = fp.replace('20201704/pascal_panoptic_parts_v1', '20210503/pascal_panoptic_parts_v2') 34 | assert not op.exists(path_new), f'path {path_new} exists.' 
35 | os.makedirs(op.dirname(path_new), exist_ok=True) 36 | Image.fromarray(uids, mode='I').save(path_new, format='TIFF', compression='tiff_lzw') 37 | 38 | 39 | def validate(): 40 | basepath_v1 = 'pascal_panoptic_parts/releases/20201704/pascal_panoptic_parts_v1' 41 | basepath_v2 = 'pascal_panoptic_parts/releases/20210503/pascal_panoptic_parts_v2' 42 | 43 | filepaths_v1 = glob.glob(op.join(basepath_v1, 'training/*.tif')) + glob.glob(op.join(basepath_v1, 'validation/*.tif')) 44 | filepaths_v2 = [fp.replace('20201704/pascal_panoptic_parts_v1', '20210503/pascal_panoptic_parts_v2') for fp in filepaths_v1] 45 | 46 | for i, (f1, f2) in enumerate(zip(filepaths_v1, filepaths_v2)): 47 | l1 = np.asanyarray(Image.open(f1), dtype=np.int32) 48 | l2 = np.asanyarray(Image.open(f2), dtype=np.int32) 49 | # if there are differences print the unique tuples with (uid_l1, uid_l2) corresponding 50 | # to the same spatial position 51 | cond = l1 != l2 52 | if np.any(cond): 53 | uids_tuples = np.unique(np.stack([l1[cond], l2[cond]]), axis=1) 54 | print(i, *(uids_tuples[:, j] for j in range(uids_tuples.shape[1]))) 55 | else: 56 | print('No diff.') 57 | 58 | 59 | if __name__ == '__main__': 60 | # convert() 61 | validate() 62 | -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/utils/internal/populate_ppp_official_evalspec.py: -------------------------------------------------------------------------------- 1 | from ruamel.yaml import YAML 2 | 3 | from panoptic_parts.specs.dataset_spec import DatasetSpec 4 | 5 | 6 | with open('ppp_20_58_iou_evalspec.yaml') as fd: 7 | gspec = YAML().load(fd) 8 | 9 | dspec = DatasetSpec(gspec['dataset_spec_path']) 10 | 11 | with open('ppq_ppp_59_57_evalspec.yaml') as fd: 12 | espec = YAML().load(fd) 13 | 14 | 15 | 16 | # dataset_sid_pid2eval_sid_pid 17 | ################################################################################################### 18 | part_groupings = gspec['part_groupings'] 19 | dataset_sid_pid2eval_sid_pid = dict() 20 | for sid_pid, (scene_class, part_class) in dspec.sid_pid2scene_class_part_class.items(): 21 | if sid_pid == 0 or scene_class not in part_groupings.keys(): 22 | continue 23 | sid = sid_pid // 100 24 | pid_new = None 25 | # find the part_class position in the part_groupings dict 26 | for pid_new_cand, (part_class_new, part_classes_old) in enumerate(part_groupings[scene_class].items(), start=1): 27 | for part_class_old in part_classes_old: 28 | if part_class_old == part_class: 29 | pid_new = pid_new_cand 30 | break 31 | else: # ie inner loop DOES NOT break, continue mid loop 32 | continue 33 | break # if inner loop breaks, then break mid loop 34 | else: # ie mid loop DOES NOT break, continue outer loop 35 | continue 36 | dataset_sid_pid2eval_sid_pid[sid_pid] = sid * 100 + pid_new 37 | 38 | # sanity check 39 | esd2epf = espec['eval_sid_pid2eval_pid_flat'] 40 | assert all(v in esd2epf.keys() for v in dataset_sid_pid2eval_sid_pid.values()) 41 | 42 | # print in a friendly copy-paste way to yaml 43 | sid_prev = 0 44 | for k, v in dataset_sid_pid2eval_sid_pid.items(): 45 | sid_cur = k // 100 46 | if sid_cur > sid_prev: 47 | sid_prev = sid_cur 48 | print('\n ', end='') 49 | print('{}_{:02d}'.format(*divmod(k, 100)) + ': ' + '{}_{:02d}'.format(*divmod(v, 100)) + ',', end=' ') 50 | ################################################################################################### 51 | 52 | # eval_sid2scene_label 53 | 
################################################################################################### 54 | # eval_sid2dataset_sid = espec['eval_sid2scene_label'] 55 | # eval_sid2scene_label = {es: dspec.scene_class_from_sid(ds) for es, ds in eval_sid2dataset_sid.items()} 56 | ################################################################################################### 57 | 58 | # eval_pid_flat2scene_part_label 59 | ################################################################################################### 60 | eval_pid_flat = espec['eval_pid_flat2scene_part_label'].keys() 61 | eval_pid_flat2eval_sid_pid = {v: k for k, v in espec['eval_sid_pid2eval_pid_flat'].items()} 62 | eval_pid_flat2eval_sid_pid[0] = 0 63 | 64 | part_groupings['UNLABELED'] = {'UNLABELED': ['UNLABELED']} 65 | 66 | eval_pid_flat2scene_part_label = dict() 67 | for k in eval_pid_flat: 68 | eval_sid_pid = eval_pid_flat2eval_sid_pid[k] 69 | eval_sid, eval_pid = divmod(eval_sid_pid, 100) 70 | scene_class = dspec.scene_class_from_sid(eval_sid) 71 | part_class_new2part_classes_old = {'UNLABELED': ['UNLABELED']} 72 | part_class_new2part_classes_old.update(part_groupings[scene_class]) 73 | part_class = list(part_class_new2part_classes_old.keys())[eval_pid] 74 | eval_pid_flat2scene_part_label[k] = f'{scene_class}-{part_class}' 75 | ################################################################################################### 76 | breakpoint() 77 | -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/utils/internal/ppq_ppp_20_58_part_groupings.yaml: -------------------------------------------------------------------------------- 1 | version: 2.0 2 | comments: 3 | - The IoU eval specification contains two optional fields. 4 | - scene_class_new2scene_classes_old contains a mapping from the new scene-level classes to at least one of the original scene-level classes of the dataset (dataset_spec.l). Use this mapping to group or ignore scene-level classes. 5 | - part_groupings is a mapping from the new part-level classes to at least one of the original part-level classes of the dataset. 6 | 7 | dataset_spec_path: panoptic_parts/specs/dataset_specs/ppp_datasetspec.yaml 8 | 9 | 10 | # part_groupings provides the information of the grouped part-level classes. 
11 | # typing: Dict(scene_class, Dict(part_class_new, part_classes_old)) 12 | # for now only a grouping of the part_classes under the same scene_class is supported 13 | part_groupings: { 14 | aeroplane: { 15 | # UNLABELED: IGNORED, 16 | body: [body], 17 | engine: [engine], 18 | wing: [lwing, rwing], 19 | stern: [stern, tail], 20 | wheel: [wheel], 21 | }, 22 | bicycle: { 23 | wheel: [fwheel, bwheel, chainwheel], 24 | body: [UNLABELED, saddle, handlebar, headlight], 25 | }, 26 | bird: { 27 | # UNLABELED: IGNORED, 28 | head: [head, leye, reye, beak], 29 | wing: [lwing, rwing], 30 | leg: [lleg, lfoot, rleg, rfoot], 31 | torso: [torso, neck, tail], 32 | }, 33 | boat: { 34 | boat: [UNLABELED], 35 | }, 36 | bottle: { 37 | # UNLABELED: IGNORED, 38 | cap: [cap], 39 | body: [body], 40 | }, 41 | bus: { 42 | # UNLABELED: IGNORED, 43 | window: [window], 44 | wheel: [wheel], 45 | body: [frontside, leftside, rightside, backside, roofside, leftmirror, rightmirror, fliplate, bliplate, door, headlight], 46 | }, 47 | car: { 48 | window: [window], 49 | wheel: [wheel], 50 | light: [headlight], 51 | license plate: [fliplate, bliplate], 52 | body: [frontside, leftside, rightside, backside, roofside, leftmirror, rightmirror, door], 53 | }, 54 | cat: { 55 | head: [head, leye, reye, lear, rear, nose], 56 | lower leg: [lfleg, lfpa, rfleg, rfpa, lbleg, lbpa, rbleg, rbpa], 57 | tail: [tail], 58 | torso: [torso, neck], 59 | }, 60 | chair: { 61 | chair: [UNLABELED], 62 | }, 63 | cow: { 64 | head: [head, leye, reye, lear, rear, muzzle, lhorn, rhorn], 65 | tail: [tail], 66 | lower leg: [lfuleg, lflleg, rfuleg, rflleg, lbuleg, lblleg, rbuleg, rblleg], 67 | torso: [torso, neck], 68 | }, 69 | table: { 70 | table: [UNLABELED], 71 | }, 72 | dog: { 73 | head: [head, leye, reye, lear, rear, nose, muzzle], 74 | lower leg: [lfleg, lfpa, rfleg, rfpa, lbleg, lbpa, rbleg, rbpa], 75 | tail: [tail], 76 | torso: [torso, neck], 77 | }, 78 | horse: { 79 | head: [head, leye, reye, lear, rear, muzzle], 80 | tail: [tail], 81 | leg: [lfho, rfho, lbho, rbho, lfuleg, lflleg, rfuleg, rflleg, lbuleg, lblleg, rbuleg, rblleg], 82 | torso: [torso, neck], 83 | }, 84 | motorbike: { 85 | wheel: [fwheel, bwheel], 86 | body: [UNLABELED, handlebar, saddle, headlight], 87 | }, 88 | person: { 89 | head: [head, leye, reye, lear, rear, lebrow, rebrow, nose, mouth, hair], 90 | torso: [neck, torso], 91 | lower arm: [llarm, lhand, rlarm, rhand], 92 | upper arm: [luarm, ruarm], 93 | lower leg: [llleg, lfoot, rlleg, rfoot], 94 | upper leg: [luleg, ruleg], 95 | }, 96 | pottedplant: { 97 | pot: [pot], 98 | plant: [plant], 99 | }, 100 | sheep: { 101 | head: [head, leye, reye, lear, rear, muzzle, lhorn, rhorn], 102 | leg: [lfuleg, lflleg, rfuleg, rflleg, lbuleg, lblleg, rbuleg, rblleg], 103 | torso: [torso, neck, tail], 104 | }, 105 | sofa: { 106 | sofa: [UNLABELED], 107 | }, 108 | train: { 109 | train: [head, hfrontside, hleftside, hrightside, hbackside, hroofside, headlight, coach, cfrontside, cleftside, crightside, cbackside, croofside], 110 | }, 111 | tvmonitor: { 112 | screen: [screen], 113 | frame: [frame], 114 | }, 115 | } 116 | -------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/visualization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tue-mps/tapps/54d8d9f039fa91da6e046bfd8c62c13a9dc5e8b0/utils/panoptic_parts/panoptic_parts/visualization/__init__.py 
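For orientation, the sketch below shows how a mapping like `eval_sid_pid2eval_pid_flat` from the eval specs above can be applied to decoded labels: `sid_pid` values are encoded as `sid * 100 + pid`, the same convention behind the `divmod(sid_pid, 100)` calls in the populate script above. The dictionary subset and the `flatten_part_labels` helper are illustrative only, not part of the package API; evaluation in the package should go through the eval spec classes (e.g. `PartPQEvalSpec`) documented earlier.

```python
import numpy as np

# Illustrative subset of eval_sid_pid2eval_pid_flat (person parts, CPP default eval spec).
EVAL_SID_PID2EVAL_PID_FLAT = {2401: 1, 2402: 2, 2403: 3, 2404: 4}


def flatten_part_labels(sids_pids: np.ndarray, mapping: dict, unmapped: int = 0) -> np.ndarray:
    """Map sid_pid values (sid * 100 + pid) to flat part ids; unmapped values become `unmapped`."""
    flat = np.full_like(sids_pids, unmapped)
    for sid_pid, pid_flat in mapping.items():
        flat[sids_pids == sid_pid] = pid_flat
    return flat


# sids_pids as returned by decode_uids(..., return_sids_pids=True)
sids_pids = np.array([2401, 2404, 23, 2605])
print(flatten_part_labels(sids_pids, EVAL_SID_PID2EVAL_PID_FLAT))  # [1 4 0 0]
```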
-------------------------------------------------------------------------------- /utils/panoptic_parts/panoptic_parts/visualization/visualize_label_with_legend.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run this script as 3 | `python -m panoptic_parts.visualization.visualize_label_with_legend \ 4 | ` 5 | to visualize a label in all three levels (semantic, instance, parts), 6 | together with a legend including all the colors and uids in that label. 7 | """ 8 | import argparse 9 | 10 | import numpy as np 11 | from PIL import Image 12 | import matplotlib.pyplot as plt 13 | 14 | from panoptic_parts.utils.visualization import experimental_colorize_label 15 | from panoptic_parts.utils.format import decode_uids, encode_ids 16 | from panoptic_parts.specs.dataset_spec import DatasetSpec 17 | 18 | 19 | def visualize_from_paths(datasetspec_path, label_path): 20 | """ 21 | Visualizes in a pyplot window a label from the provided path. 22 | 23 | For visualization pixels are colored on: 24 | - semantic-level: according to colors defined in dataspec.sid2scene_color 25 | - semantic-instance-level: with random shades of colors defined in dataspec.sid2scene_color 26 | - semantic-instance-parts-level: with a mixture of parula colormap and the shades above 27 | See panoptic_parts.utils.visualization.uid2color for more information on color generation. 28 | 29 | Args: 30 | datasetspec_path: a YAML file path, including keys: 31 | `sid2scene_color`, `scene_class_part_class_from_sid_pid` 32 | label_path: a label path, will be passed to Pillow.Image.open 33 | """ 34 | spec = DatasetSpec(datasetspec_path) 35 | uids = np.array(Image.open(label_path), dtype=np.int32) 36 | # for PPP, we need to fold groupable parts (see dataset ppp_datasetspec.yaml for more details) 37 | uids = encode_ids(*decode_uids(uids, experimental_dataset_spec=spec, experimental_correct_range=True)) 38 | 39 | uids_sem_inst_parts_colored, uid2color_dct = experimental_colorize_label( 40 | uids, sid2color=spec.sid2scene_color, emphasize_instance_boundaries=True, return_uid2color=True, 41 | experimental_deltas=(60, 60, 60), experimental_alpha=0.5) 42 | 43 | # plot 44 | _, ax1 = plt.subplots() 45 | 46 | # generate legend, h is a hidden rectangle just to create a legend entry 47 | handles = [] 48 | handles_text = [] 49 | uids_unique = np.unique(uids) 50 | for uid in uids_unique: 51 | h = plt.Rectangle((0, 0), 1, 1, fc=list(map(lambda x: x/255, uid2color_dct[uid]))) 52 | handles.append(h) 53 | _, _, _, sid_pid = decode_uids(uid, return_sids_pids=True) 54 | scene_class_part_class = spec.scene_class_part_class_from_sid_pid(sid_pid) 55 | handles_text.append(f'{uid}: {scene_class_part_class}') 56 | 57 | ax1.imshow(uids_sem_inst_parts_colored) 58 | ax1.set_title('labels colored on semantic, instance, and part levels', fontsize='small') 59 | ax1.legend(handles, handles_text, ncol=3, fontsize='small', handlelength=1.0, 60 | loc='center left', bbox_to_anchor=(1.01, 0.5)) 61 | plt.tight_layout() 62 | plt.show() 63 | 64 | 65 | def main(): 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument('datasetspec_path') 68 | parser.add_argument('label_path') 69 | args = parser.parse_args() 70 | visualize_from_paths(args.datasetspec_path, args.label_path) 71 | 72 | return 73 | 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /utils/panoptic_parts/pyproject.toml: 
-------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=51", 4 | "wheel", 5 | "numpy>=1.15" 6 | ] 7 | build-backend = "setuptools.build_meta" 8 | -------------------------------------------------------------------------------- /utils/panoptic_parts/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.15 2 | Pillow>=8.0 3 | scipy>=1.4 4 | ruamel.yaml>=0.15 5 | matplotlib>=3.3.0 -------------------------------------------------------------------------------- /utils/panoptic_parts/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = panoptic_parts 3 | version = 2.0rc5 4 | description = Panoptic Parts datasets 5 | long_description = file: README.md 6 | long_description_content_type = text/markdown 7 | author = Panoptic Parts team 8 | author_email = panoptic.parts@outlook.com 9 | maintainer = Panagiotis Meletis 10 | maintainer_email = panoptic.parts@outlook.com 11 | url = https://github.com/pmeletis/panoptic_parts 12 | project_urls = 13 | Documentation = https://panoptic-parts.readthedocs.io 14 | Bug Tracker = https://github.com/pmeletis/panoptic_parts/issues 15 | classifiers = 16 | Programming Language :: Python :: 3 17 | Programming Language :: Python :: 3.7 18 | Operating System :: OS Independent 19 | 20 | [options] 21 | packages = find: 22 | python_requires = >=3.7 23 | install_requires = 24 | numpy>=1.15 25 | Pillow>=8.0 26 | scipy>=1.4 27 | ruamel.yaml>=0.15 28 | matplotlib>=3.3.0 29 | include_package_data = True 30 | 31 | [options.extras_require] 32 | MERGING = 33 | tqdm 34 | pycocotools>=2.0.0 35 | 36 | [options.entry_points] 37 | console_scripts = 38 | pp_merge_to_pps = panoptic_parts.merging.merge_to_pps:main [MERGING] 39 | pp_merge_to_panoptic = panoptic_parts.merging.merge_to_panoptic:main [MERGING] 40 | pp_visualize_label_with_legend = panoptic_parts.visualization.visualize_label_with_legend:main 41 | -------------------------------------------------------------------------------- /utils/panoptic_parts/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tue-mps/tapps/54d8d9f039fa91da6e046bfd8c62c13a9dc5e8b0/utils/panoptic_parts/tests/__init__.py -------------------------------------------------------------------------------- /utils/panoptic_parts/tests/cityscapes_panoptic_parts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tue-mps/tapps/54d8d9f039fa91da6e046bfd8c62c13a9dc5e8b0/utils/panoptic_parts/tests/cityscapes_panoptic_parts/__init__.py -------------------------------------------------------------------------------- /utils/panoptic_parts/tests/cityscapes_panoptic_parts/dataset_sanity_check.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script reads the original labels of Cityscapes (CO) and compares them against 3 | the Cityscapes-Panoptic-Parts (CPP) labels. It verifies that the semantic and instance 4 | level labels of Cityscapes Panoptic Parts (CPP) are equivalent to 5 | original Cityscapes (CO), i.e., sids_iids_CPP == sids_iids_CO. 6 | """ 7 | import sys 8 | assert float(sys.version[:3]) >= 3.6, 'This test uses Python >= 3.6 functionality.' 
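# Caveat: slicing sys.version breaks for double-digit minor versions (e.g. Python 3.10 gives '3.1'),
# so this assert rejects newer interpreters; sys.version_info >= (3, 6) is the robust check.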
9 | import os.path as op 10 | import glob 11 | import multiprocessing 12 | 13 | import numpy as np 14 | from PIL import Image 15 | 16 | from panoptic_parts.utils.format import decode_uids 17 | 18 | # find all label paths 19 | BASEPATH_LABELS_ORIGINAL = 'tests/tests_files/cityscapes/gtFine' 20 | labels_paths_original = glob.glob(op.join(BASEPATH_LABELS_ORIGINAL, 'train', '*', '*_instanceIds.png')) 21 | labels_paths_original.extend(glob.glob(op.join(BASEPATH_LABELS_ORIGINAL, 'val', '*', '*_instanceIds.png'))) 22 | print(len(labels_paths_original)) 23 | labels_paths_ours = [ 24 | lp.replace('cityscapes/gtFine', 'cityscapes_panoptic_parts/gtFine_v2').replace('_instanceIds.png', 'PanopticParts.tif') 25 | for lp in labels_paths_original] 26 | print(len(labels_paths_ours)) 27 | 28 | def _sids_iids_are_maintained(inpts): 29 | lp_orig, lp_ours = inpts 30 | labels_orig = np.asarray(Image.open(lp_orig), dtype=np.int32) 31 | labels_ours = np.asarray(Image.open(lp_ours), dtype=np.int32) 32 | _, _, _, sids_iids = decode_uids(labels_ours, return_sids_iids=True) 33 | returns = np.all(np.equal(labels_orig, sids_iids)) 34 | # if not returns: 35 | # print(lp_orig, lp_ours, sep='\n') 36 | # print(np.unique(labels_orig), print(np.unique(sids_iids)), np.unique(labels_ours), sep='\n') 37 | return returns 38 | 39 | # validate labels 40 | with multiprocessing.Pool(multiprocessing.cpu_count()) as pool: 41 | maintained_bools =[mb for mb in pool.imap_unordered( 42 | _sids_iids_are_maintained, zip(labels_paths_original, labels_paths_ours), chunksize=10)] 43 | 44 | print(len(maintained_bools), 'files were verified.') 45 | assert all(maintained_bools), 'some sids_iids are not the same' 46 | -------------------------------------------------------------------------------- /utils/panoptic_parts/tests/cityscapes_panoptic_parts/visualize_from_paths_test.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # Change the paths below for your system and run this script from top-level dir as: 3 | # bash tests/cityscapes_panoptic_parts/visualize_from_paths_test.sh 4 | 5 | python -m panoptic_parts.cityscapes_panoptic_parts.visualize_from_paths \ 6 | tests/tests_files/cityscapes_panoptic_parts/leftImg8bit/train/aachen/aachen_000012_000019_leftImg8bit.png \ 7 | tests/tests_files/cityscapes_panoptic_parts/gtFine_v2/train/aachen/aachen_000012_000019_gtFinePanopticParts.tif \ 8 | panoptic_parts/utils/defs/cpp_20.yaml 9 | -------------------------------------------------------------------------------- /utils/panoptic_parts/tests/cityscapes_panoptic_parts/visualize_label_with_legend_test.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # Change the paths below for your system and run this script from top-level dir as: 3 | # bash tests/cityscapes_panoptic_parts/visualize_label_with_legend_test.sh 4 | 5 | python -m panoptic_parts.cityscapes_panoptic_parts.visualize_label_with_legend \ 6 | tests/tests_files/gtFinePanopticParts/val/munster/munster_000080_000019_gtFinePanopticParts.tif \ 7 | panoptic_parts/utils/defs/cpp_20.yaml 8 | -------------------------------------------------------------------------------- /utils/panoptic_parts/tests/pascal_panoptic_parts/visualize_from_paths_test.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/sh 2 | # Change the paths below for your system and run this script from top-level dir as: 3 | # bash tests/pascal_panoptic_parts/visualize_from_paths_test.sh 4 | 5 | python -m panoptic_parts.pascal_panoptic_parts.visualize_from_paths \ 6 | tests/tests_files/pascal_panoptic_parts/images/2010_002877.jpg \ 7 | tests/tests_files/pascal_panoptic_parts/labels/2010_002877.tif \ 8 | panoptic_parts/utils/defs/ppp_100.yaml 9 | -------------------------------------------------------------------------------- /utils/panoptic_parts/tests/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tue-mps/tapps/54d8d9f039fa91da6e046bfd8c62c13a9dc5e8b0/utils/panoptic_parts/tests/utils/__init__.py -------------------------------------------------------------------------------- /utils/panoptic_parts/tests/utils/utils_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import tifffile 4 | from panoptic_parts.utils.utils import safe_write 5 | 6 | pth0 = "test.png" 7 | pth1 = "test1.png" 8 | pth2 = "test2.png" 9 | pth3 = "test3.png" 10 | pth4 = "test4.png" 11 | 12 | im = np.random.randint(0, high=255, size=(600, 800, 3), dtype=np.uint8) 13 | 14 | # all following commands should have the same output file size 15 | safe_write(pth0, im) 16 | safe_write(pth1, im, optimize=True) 17 | safe_write(pth2, im, compression_level=9) 18 | tifffile.imwrite(pth3, im) 19 | tifffile.imwrite(pth4, im, compression = 'zlib') 20 | -------------------------------------------------------------------------------- /utils/panoptic_parts/tests/utils/visualization_test.py: -------------------------------------------------------------------------------- 1 | import matplotlib as mpl 2 | import matplotlib.pyplot as plt 3 | 4 | from panoptic_parts.utils.visualization import random_colors, PARULA99_CM 5 | 6 | def random_colors_test(): 7 | colors0 = random_colors(0) 8 | assert len(colors0) == 0 9 | print(colors0) 10 | colors1 = random_colors(1) 11 | assert len(colors1) == 1 12 | print(colors1) 13 | colors10 = random_colors(10) 14 | assert len(colors10) == 10 15 | assert all(isinstance(color, tuple) for color in colors10) 16 | print(colors10) 17 | 18 | 19 | def parula99_cm_test(): 20 | # just a demo function plotting the colormap 21 | fig, ax = plt.subplots(figsize=(6, 1)) 22 | fig.subplots_adjust(bottom=0.5) 23 | norm = mpl.colors.Normalize(vmin=1, vmax=PARULA99_CM.N + 1) 24 | fig.colorbar(mpl.cm.ScalarMappable(norm=norm, cmap=PARULA99_CM), 25 | cax=ax, orientation='horizontal', label='part-level semantic classes') 26 | fig.waitforbuttonpress(30.0) 27 | Nparts = 5 28 | bounds = list(range(1, Nparts + 1)) 29 | norm = mpl.colors.BoundaryNorm(bounds, PARULA99_CM.N, extend='both') 30 | print(*map(norm, range(Nparts + 1 + 1))) 31 | mpl.colorbar.ColorbarBase(ax, cmap=PARULA99_CM, norm=norm, orientation='horizontal') 32 | plt.draw() 33 | fig.waitforbuttonpress(30.0) 34 | 35 | 36 | if __name__ == "__main__": 37 | random_colors_test() 38 | parula99_cm_test() 39 | --------------------------------------------------------------------------------