├── .gitignore ├── GETTING_STARTED.md ├── INSTALL.md ├── LICENSE ├── README.md ├── colab └── oneformer_colab.ipynb ├── configs ├── ade20k │ ├── Base-ADE20K-UnifiedSegmentation.yaml │ ├── convnext │ │ ├── oneformer_convnext_large_bs16_160k.yaml │ │ └── oneformer_convnext_xlarge_bs16_160k.yaml │ ├── dinat │ │ ├── coco_pretrain_oneformer_dinat_large_bs16_160k_1280x1280.yaml │ │ ├── oneformer_dinat_large_bs16_160k.yaml │ │ ├── oneformer_dinat_large_bs16_160k_1280x1280.yaml │ │ └── oneformer_dinat_large_bs16_160k_896x896.yaml │ ├── oneformer_R50_bs16_160k.yaml │ └── swin │ │ ├── oneformer_swin_large_bs16_160k.yaml │ │ ├── oneformer_swin_large_bs16_160k_1280x1280.yaml │ │ ├── oneformer_swin_large_bs16_160k_896x896.yaml │ │ └── oneformer_swin_tiny_bs16_160k.yaml ├── cityscapes │ ├── Base-Cityscapes-UnifiedSegmentation.yaml │ ├── convnext │ │ ├── mapillary_pretrain_oneformer_convnext_large_bs16_90k.yaml │ │ ├── mapillary_pretrain_oneformer_convnext_xlarge_bs16_90k.yaml │ │ ├── oneformer_convnext_large_bs16_90k.yaml │ │ └── oneformer_convnext_xlarge_bs16_90k.yaml │ ├── dinat │ │ └── oneformer_dinat_large_bs16_90k.yaml │ ├── oneformer_R50_bs16_90k.yaml │ └── swin │ │ └── oneformer_swin_large_bs16_90k.yaml ├── coco │ ├── Base-COCO-UnifiedSegmentation.yaml │ ├── dinat │ │ └── oneformer_dinat_large_bs16_100ep.yaml │ ├── oneformer_R50_bs16_50ep.yaml │ └── swin │ │ ├── oneformer_swin_large_bs16_100ep.yaml │ │ └── oneformer_swin_tiny_bs16_50ep.yaml └── mapillary_vistas │ ├── Base-Mapillary-UnifiedSegmentation.yaml │ ├── convnext │ ├── cityscapes_pretrain_oneformer_convnext_large_bs16_300k.yaml │ ├── cityscapes_pretrain_oneformer_convnext_xlarge_bs16_300k.yaml │ └── oneformer_convnext_large_bs16_300k.yaml │ ├── dinat │ └── oneformer_dinat_large_bs16_300k.yaml │ ├── oneformer_R50_bs16_300k.yaml │ └── swin │ └── oneformer_swin_large_bs16_300k.yaml ├── datasets ├── README.md ├── ade20k_instance_catid_mapping.txt ├── ade20k_instance_imgCatIds.json ├── custom_datasets │ ├── README.md │ ├── instance_coco_custom_dataset_mapper.py │ ├── instance_oneformer_custom_dataset_mapper.py │ └── semantic_oneformer_custom_dataset_mapper.py ├── fg_ids.py ├── panoptic2detection_coco_format.py ├── panoptic_coco_categories.json ├── prepare_ade20k_ins_seg.py ├── prepare_ade20k_pan_seg.py ├── prepare_ade20k_sem_seg.py └── prepare_coco_semantic_annos_from_panoptic_annos.py ├── demo ├── README.md ├── colormap.py ├── defaults.py ├── demo.py ├── predictor.py └── visualizer.py ├── images ├── oneformer.svg ├── plots.svg ├── teaser.png └── teaser.svg ├── oneformer ├── __init__.py ├── config.py ├── data │ ├── __init__.py │ ├── bpe_simple_vocab_16e6.txt │ ├── bpe_simple_vocab_16e6.txt.gz │ ├── build.py │ ├── dataset_mappers │ │ ├── __init__.py │ │ ├── coco_unified_new_baseline_dataset_mapper.py │ │ ├── dataset_mapper.py │ │ └── oneformer_unified_dataset_mapper.py │ ├── datasets │ │ ├── __init__.py │ │ ├── register_ade20k_instance.py │ │ ├── register_ade20k_panoptic.py │ │ ├── register_cityscapes_panoptic.py │ │ ├── register_coco_panoptic2instance.py │ │ ├── register_coco_panoptic_annos_semseg.py │ │ ├── register_mapillary_vistas.py │ │ └── register_mapillary_vistas_panoptic.py │ └── tokenizer.py ├── datasetmapper_tta.py ├── evaluation │ ├── __init__.py │ ├── cityscapes_evaluation.py │ ├── coco_evaluator.py │ ├── detection_coco_evaluator.py │ ├── evaluator.py │ └── instance_evaluation.py ├── modeling │ ├── __init__.py │ ├── backbone │ │ ├── __init__.py │ │ ├── convnext.py │ │ ├── dinat.py │ │ └── swin.py │ ├── criterion.py │ ├── matcher.py 
│ ├── meta_arch │ │ ├── __init__.py │ │ └── oneformer_head.py │ ├── pixel_decoder │ │ ├── __init__.py │ │ ├── fpn.py │ │ ├── msdeformattn.py │ │ └── ops │ │ │ ├── functions │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn_func.py │ │ │ ├── make.sh │ │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn.py │ │ │ ├── setup.py │ │ │ ├── src │ │ │ ├── cpu │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ └── ms_deform_attn_cpu.h │ │ │ ├── cuda │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ ├── ms_deform_attn.h │ │ │ └── vision.cpp │ │ │ └── test.py │ └── transformer_decoder │ │ ├── __init__.py │ │ ├── oneformer_transformer_decoder.py │ │ ├── position_encoding.py │ │ ├── text_transformer.py │ │ └── transformer.py ├── oneformer_model.py ├── test_time_augmentation.py └── utils │ ├── __init__.py │ ├── box_ops.py │ ├── events.py │ ├── misc.py │ └── pos_embed.py ├── requirements.txt ├── tools ├── README.md ├── analyze_model.py ├── calc_throughput.py ├── convert-pretrained-model-to-d2.py ├── convert-pretrained-nat-model-to-d2.py ├── convert-torchvision-to-d2.py ├── setup_detectron2.py └── trainers │ ├── trainer.py │ └── trainer_base.py └── train_net.py /.gitignore: -------------------------------------------------------------------------------- 1 | *_video 2 | *_video.py 3 | extras/ 4 | 5 | # output dir 6 | .DS_Store 7 | output 8 | instant_test_output 9 | inference_test_output 10 | 11 | *.json 12 | *.diff 13 | *.jpg 14 | !/projects/DensePose/doc/images/*.jpg 15 | 16 | # compilation and distribution 17 | __pycache__ 18 | _ext 19 | *.pyc 20 | *.pyd 21 | *.so 22 | *.dll 23 | *.egg-info/ 24 | build/ 25 | dist/ 26 | wheels/ 27 | 28 | # pytorch/python/numpy formats 29 | *.pth 30 | *.pkl 31 | *.npy 32 | *.ts 33 | model_ts*.txt 34 | 35 | # ipython/jupyter notebooks 36 | **/.ipynb_checkpoints/ 37 | 38 | # Editor temporaries 39 | *.swn 40 | *.swo 41 | *.swp 42 | *~ 43 | 44 | # editor settings 45 | .idea 46 | .vscode 47 | _darcs 48 | 49 | # project dirs 50 | /detectron2/model_zoo/configs 51 | /projects/*/datasets 52 | /models 53 | /snippet -------------------------------------------------------------------------------- /GETTING_STARTED.md: -------------------------------------------------------------------------------- 1 | # Getting Started with OneFormer 2 | 3 | This document provides a brief introduction to the usage of OneFormer. 4 | 5 | Please see [Getting Started with Detectron2](https://github.com/facebookresearch/detectron2/blob/master/GETTING_STARTED.md) for full usage. 6 | 7 | ## Training 8 | 9 | - Make sure to set up wandb before training a model. 10 | 11 | ```bash 12 | pip install wandb 13 | wandb login 14 | ``` 15 | 16 | - We provide a script, `train_net.py`, that trains all the configs provided in OneFormer. 17 | 18 | - To train a model with `train_net.py`, first set up the corresponding datasets following [datasets/README.md](./datasets/README.md). 19 | 20 | - By default, the model uses `task=panoptic` for evaluation during training. 21 | 22 | ```bash 23 | python train_net.py --dist-url 'tcp://127.0.0.1:50163' \ 24 | --num-gpus 8 \ 25 | --config-file configs/ade20k/swin/oneformer_swin_large_bs16_160k.yaml \ 26 | OUTPUT_DIR outputs/ade20k_swin_large WANDB.NAME ade20k_swin_large 27 | ``` 28 | 29 | ## Evaluation 30 | 31 | - You need to pass the value of the `task` token. `task` must be one of [panoptic, semantic, instance].
32 | 33 | - To evaluate a model's performance, use: 34 | 35 | ```bash 36 | python train_net.py --dist-url 'tcp://127.0.0.1:50164' \ 37 | --num-gpus 8 \ 38 | --config-file configs/ade20k/swin/oneformer_swin_large_bs16_160k.yaml \ 39 | --eval-only MODEL.IS_TRAIN False MODEL.WEIGHTS <path-to-checkpoint> \ 40 | MODEL.TEST.TASK <task> 41 | ``` 42 | 43 | ## Inference Demo 44 | 45 | We provide a demo script for inference on images. For more information, please see [demo/README.md](demo/README.md). 46 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ## Requirements 4 | 5 | We use an environment with the following specifications, packages and dependencies: 6 | 7 | - Ubuntu 20.04.3 LTS 8 | - Python 3.8.13 9 | - conda 4.12.0 10 | - [PyTorch v1.10.1](https://pytorch.org/get-started/previous-versions/) 11 | - [Torchvision v0.11.2](https://pytorch.org/get-started/previous-versions/) 12 | - [Detectron2 v0.6](https://github.com/facebookresearch/detectron2/releases/tag/v0.6) 13 | - [NATTEN v0.14.4](https://github.com/SHI-Labs/NATTEN/releases/tag/v0.14.4) 14 | 15 | ## Setup Instructions 16 | 17 | - Create a conda environment. 18 | 19 | ```bash 20 | conda create --name oneformer python=3.8 -y 21 | conda activate oneformer 22 | ``` 23 | 24 | - Install packages and other dependencies. 25 | 26 | ```bash 27 | git clone https://github.com/SHI-Labs/OneFormer.git 28 | cd OneFormer 29 | 30 | # Install PyTorch 31 | conda install pytorch==1.10.1 torchvision==0.11.2 cudatoolkit=11.3 -c pytorch -c conda-forge 32 | 33 | # Install opencv (required for running the demo) 34 | pip3 install -U opencv-python 35 | 36 | # Install detectron2 37 | python tools/setup_detectron2.py 38 | 39 | # Install other dependencies 40 | pip3 install git+https://github.com/cocodataset/panopticapi.git 41 | pip3 install git+https://github.com/mcordts/cityscapesScripts.git 42 | pip3 install -r requirements.txt 43 | ``` 44 | 45 | - Set up wandb. 46 | 47 | ```bash 48 | # Set up wandb 49 | pip3 install wandb 50 | wandb login 51 | ``` 52 | 53 | - Set up the CUDA kernel for MSDeformAttn. `CUDA_HOME` must be defined and point to the directory of the installed CUDA toolkit. 54 | 55 | ```bash 56 | # Setup MSDeformAttn 57 | cd oneformer/modeling/pixel_decoder/ops 58 | sh make.sh 59 | cd ../../../.. 60 | ``` 61 | 62 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 SHI Labs 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /configs/ade20k/Base-ADE20K-UnifiedSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_panoptic_train",) 18 | TEST_PANOPTIC: ("ade20k_panoptic_val",) 19 | TEST_INSTANCE: ("ade20k_instance_val",) 20 | TEST_SEMANTIC: ("ade20k_sem_seg_val",) 21 | SOLVER: 22 | IMS_PER_BATCH: 16 23 | BASE_LR: 0.0001 24 | MAX_ITER: 160000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 0 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | LR_SCHEDULER_NAME: "WarmupPolyLR" 30 | BACKBONE_MULTIPLIER: 0.1 31 | CLIP_GRADIENTS: 32 | ENABLED: True 33 | CLIP_TYPE: "full_model" 34 | CLIP_VALUE: 0.01 35 | NORM_TYPE: 2.0 36 | AMP: 37 | ENABLED: True 38 | INPUT: 39 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"] 40 | MIN_SIZE_TRAIN_SAMPLING: "choice" 41 | MIN_SIZE_TEST: 512 42 | MAX_SIZE_TRAIN: 2048 43 | MAX_SIZE_TEST: 2048 44 | CROP: 45 | ENABLED: True 46 | TYPE: "absolute" 47 | SIZE: (512, 512) 48 | SINGLE_CATEGORY_MAX_AREA: 1.0 49 | COLOR_AUG_SSD: True 50 | SIZE_DIVISIBILITY: 512 # used in dataset mapper 51 | FORMAT: "RGB" 52 | DATASET_MAPPER_NAME: "oneformer_unified" 53 | MAX_SEQ_LEN: 77 54 | TASK_SEQ_LEN: 77 55 | TASK_PROB: 56 | SEMANTIC: 0.33 57 | INSTANCE: 0.66 58 | TEST: 59 | EVAL_PERIOD: 5000 60 | AUG: 61 | ENABLED: False 62 | MIN_SIZES: [256, 384, 512, 640, 768, 896] 63 | MAX_SIZE: 3584 64 | FLIP: True 65 | DATALOADER: 66 | FILTER_EMPTY_ANNOTATIONS: True 67 | NUM_WORKERS: 4 68 | VERSION: 2 -------------------------------------------------------------------------------- /configs/ade20k/convnext/oneformer_convnext_large_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2ConvNeXt" 5 | CONVNEXT: 6 | IN_CHANNELS: 3 7 | DEPTHS: [3, 3, 27, 3] 8 | DIMS: [192, 384, 768, 1536] 9 | DROP_PATH_RATE: 0.4 10 | LSIT: 1.0 11 | OUT_INDICES: [0, 1, 2, 3] 12 | WEIGHTS: "convnext_large_22k_1k_384.pkl" 13 | PIXEL_MEAN: [123.675, 116.280, 103.530] 14 | PIXEL_STD: [58.395, 57.120, 57.375] 15 | ONE_FORMER: 16 | NUM_OBJECT_QUERIES: 250 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | DETECTIONS_PER_IMAGE: 250 33 | EVAL_PERIOD: 5000 34 | AUG: 35 | ENABLED: False 36 | MIN_SIZES: [320, 480, 640, 800, 
960, 1120] 37 | MAX_SIZE: 4480 38 | FLIP: True 39 | -------------------------------------------------------------------------------- /configs/ade20k/convnext/oneformer_convnext_xlarge_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2ConvNeXt" 5 | CONVNEXT: 6 | IN_CHANNELS: 3 7 | DEPTHS: [3, 3, 27, 3] 8 | DIMS: [256, 512, 1024, 2048] 9 | DROP_PATH_RATE: 0.4 10 | LSIT: 1.0 11 | OUT_INDICES: [0, 1, 2, 3] 12 | WEIGHTS: "convnext_xlarge_22k_1k_384_ema.pkl" 13 | PIXEL_MEAN: [123.675, 116.280, 103.530] 14 | PIXEL_STD: [58.395, 57.120, 57.375] 15 | ONE_FORMER: 16 | NUM_OBJECT_QUERIES: 250 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | DETECTIONS_PER_IMAGE: 250 33 | EVAL_PERIOD: 5000 34 | AUG: 35 | ENABLED: False 36 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 37 | MAX_SIZE: 4480 38 | FLIP: True 39 | -------------------------------------------------------------------------------- /configs/ade20k/dinat/coco_pretrain_oneformer_dinat_large_bs16_160k_1280x1280.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2DiNAT" 5 | DiNAT: 6 | EMBED_DIM: 192 7 | MLP_RATIO: 2.0 8 | DEPTHS: [3, 4, 18, 5] 9 | NUM_HEADS: [6, 12, 24, 48] 10 | KERNEL_SIZE: 11 11 | DROP_PATH_RATE: 0.3 12 | DILATIONS: [[1, 28, 1], [1, 7, 1, 14], [1, 3, 1, 5, 1, 5, 1, 7, 1, 3, 1, 5, 1, 5, 1, 7, 1, 7], [1, 3, 1, 3, 1]] 13 | WEIGHTS: "150_16_dinat_l_oneformer_coco_100ep.pth" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | ONE_FORMER: 17 | NUM_OBJECT_QUERIES: 150 18 | SOLVER: 19 | AMP: 20 | ENABLED: False 21 | INPUT: 22 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1280) for x in range(5, 21)]"] 23 | MIN_SIZE_TRAIN_SAMPLING: "choice" 24 | MIN_SIZE_TEST: 1280 25 | MAX_SIZE_TRAIN: 5120 26 | MAX_SIZE_TEST: 5120 27 | CROP: 28 | ENABLED: True 29 | TYPE: "absolute" 30 | SIZE: (1280, 1280) 31 | SINGLE_CATEGORY_MAX_AREA: 1.0 32 | COLOR_AUG_SSD: True 33 | SIZE_DIVISIBILITY: 1280 # used in dataset mapper 34 | FORMAT: "RGB" 35 | TEST: 36 | DETECTIONS_PER_IMAGE: 150 37 | EVAL_PERIOD: 5000 38 | AUG: 39 | ENABLED: False 40 | MIN_SIZES: [640, 960, 1280, 1600, 1920, 2240] 41 | MAX_SIZE: 8960 42 | FLIP: True -------------------------------------------------------------------------------- /configs/ade20k/dinat/oneformer_dinat_large_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2DiNAT" 5 | DiNAT: 6 | EMBED_DIM: 192 7 | MLP_RATIO: 2.0 8 | DEPTHS: [3, 4, 18, 5] 9 | NUM_HEADS: [6, 12, 24, 48] 10 | KERNEL_SIZE: 11 11 | DROP_PATH_RATE: 0.3 12 | DILATIONS: [[1, 20, 1], [1, 5, 1, 10], [1, 2, 1, 3, 1, 4, 1, 5, 1, 2, 1, 3, 1, 4, 1, 5, 1, 5], [1, 2, 1, 2, 1]] 13 | WEIGHTS: "dinat_large_in22k_in1k_384_11x11.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | ONE_FORMER: 17 | NUM_OBJECT_QUERIES: 250 18 | 
SOLVER: 19 | AMP: 20 | ENABLED: False 21 | INPUT: 22 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 23 | MIN_SIZE_TRAIN_SAMPLING: "choice" 24 | MIN_SIZE_TEST: 640 25 | MAX_SIZE_TRAIN: 2560 26 | MAX_SIZE_TEST: 2560 27 | CROP: 28 | ENABLED: True 29 | TYPE: "absolute" 30 | SIZE: (640, 640) 31 | SINGLE_CATEGORY_MAX_AREA: 1.0 32 | COLOR_AUG_SSD: True 33 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 34 | FORMAT: "RGB" 35 | TEST: 36 | DETECTIONS_PER_IMAGE: 250 37 | EVAL_PERIOD: 5000 38 | AUG: 39 | ENABLED: False 40 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 41 | MAX_SIZE: 4480 42 | FLIP: True 43 | -------------------------------------------------------------------------------- /configs/ade20k/dinat/oneformer_dinat_large_bs16_160k_1280x1280.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2DiNAT" 5 | DiNAT: 6 | EMBED_DIM: 192 7 | MLP_RATIO: 2.0 8 | DEPTHS: [3, 4, 18, 5] 9 | NUM_HEADS: [6, 12, 24, 48] 10 | KERNEL_SIZE: 11 11 | DROP_PATH_RATE: 0.3 12 | DILATIONS: [[1, 28, 1], [1, 7, 1, 14], [1, 3, 1, 5, 1, 5, 1, 7, 1, 3, 1, 5, 1, 5, 1, 7, 1, 7], [1, 3, 1, 3, 1]] 13 | WEIGHTS: "dinat_large_in22k_in1k_384_11x11.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | ONE_FORMER: 17 | NUM_OBJECT_QUERIES: 250 18 | SOLVER: 19 | AMP: 20 | ENABLED: False 21 | INPUT: 22 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1280) for x in range(5, 21)]"] 23 | MIN_SIZE_TRAIN_SAMPLING: "choice" 24 | MIN_SIZE_TEST: 1280 25 | MAX_SIZE_TRAIN: 5120 26 | MAX_SIZE_TEST: 5120 27 | CROP: 28 | ENABLED: True 29 | TYPE: "absolute" 30 | SIZE: (1280, 1280) 31 | SINGLE_CATEGORY_MAX_AREA: 1.0 32 | COLOR_AUG_SSD: True 33 | SIZE_DIVISIBILITY: 1280 # used in dataset mapper 34 | FORMAT: "RGB" 35 | TEST: 36 | DETECTIONS_PER_IMAGE: 250 37 | EVAL_PERIOD: 5000 38 | AUG: 39 | ENABLED: False 40 | MIN_SIZES: [640, 960, 1280, 1600, 1920, 2240] 41 | MAX_SIZE: 8960 42 | FLIP: True -------------------------------------------------------------------------------- /configs/ade20k/dinat/oneformer_dinat_large_bs16_160k_896x896.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2DiNAT" 5 | DiNAT: 6 | EMBED_DIM: 192 7 | MLP_RATIO: 2.0 8 | DEPTHS: [3, 4, 18, 5] 9 | NUM_HEADS: [6, 12, 24, 48] 10 | KERNEL_SIZE: 11 11 | DROP_PATH_RATE: 0.3 12 | DILATIONS: [[1, 20, 1], [1, 5, 1, 10], [1, 2, 1, 3, 1, 4, 1, 5, 1, 2, 1, 3, 1, 4, 1, 5, 1, 5], [1, 2, 1, 2, 1]] 13 | WEIGHTS: "dinat_large_in22k_in1k_384_11x11.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | ONE_FORMER: 17 | NUM_OBJECT_QUERIES: 250 18 | SOLVER: 19 | AMP: 20 | ENABLED: False 21 | INPUT: 22 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 896) for x in range(5, 21)]"] 23 | MIN_SIZE_TRAIN_SAMPLING: "choice" 24 | MIN_SIZE_TEST: 896 25 | MAX_SIZE_TRAIN: 3584 26 | MAX_SIZE_TEST: 3584 27 | CROP: 28 | ENABLED: True 29 | TYPE: "absolute" 30 | SIZE: (896, 896) 31 | SINGLE_CATEGORY_MAX_AREA: 1.0 32 | COLOR_AUG_SSD: True 33 | SIZE_DIVISIBILITY: 896 # used in dataset mapper 34 | FORMAT: "RGB" 35 | TEST: 36 | DETECTIONS_PER_IMAGE: 250 37 | EVAL_PERIOD: 5000 38 | AUG: 39 | ENABLED: False 40 | MIN_SIZES: [448, 678, 896, 1120, 1344, 1568] 41 | MAX_SIZE: 6272 42 | FLIP: True 43 | 
-------------------------------------------------------------------------------- /configs/ade20k/oneformer_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-UnifiedSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "OneFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "OneFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | ONE_FORMER: 19 | TRANSFORMER_DECODER_NAME: "ContrastiveMultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | CONTRASTIVE_WEIGHT: 0.5 27 | CONTRASTIVE_TEMPERATURE: 0.07 28 | HIDDEN_DIM: 256 29 | NUM_OBJECT_QUERIES: 150 30 | USE_TASK_NORM: True 31 | NHEADS: 8 32 | DROPOUT: 0.1 33 | DIM_FEEDFORWARD: 2048 34 | ENC_LAYERS: 0 35 | PRE_NORM: False 36 | ENFORCE_INPUT_PROJ: False 37 | SIZE_DIVISIBILITY: 32 38 | CLASS_DEC_LAYERS: 2 39 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 40 | TRAIN_NUM_POINTS: 12544 41 | OVERSAMPLE_RATIO: 3.0 42 | IMPORTANCE_SAMPLE_RATIO: 0.75 43 | TEXT_ENCODER: 44 | WIDTH: 256 45 | CONTEXT_LENGTH: 77 46 | NUM_LAYERS: 6 47 | VOCAB_SIZE: 49408 48 | PROJ_NUM_LAYERS: 2 49 | N_CTX: 16 50 | TEST: 51 | SEMANTIC_ON: True 52 | INSTANCE_ON: True 53 | PANOPTIC_ON: True 54 | OVERLAP_THRESHOLD: 0.8 55 | OBJECT_MASK_THRESHOLD: 0.5 56 | TASK: "panoptic" 57 | TEST: 58 | DETECTIONS_PER_IMAGE: 150 59 | -------------------------------------------------------------------------------- /configs/ade20k/swin/oneformer_swin_large_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22kto1k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | ONE_FORMER: 18 | NUM_OBJECT_QUERIES: 250 19 | INPUT: 20 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 21 | MIN_SIZE_TRAIN_SAMPLING: "choice" 22 | MIN_SIZE_TEST: 640 23 | MAX_SIZE_TRAIN: 2560 24 | MAX_SIZE_TEST: 2560 25 | CROP: 26 | ENABLED: True 27 | TYPE: "absolute" 28 | SIZE: (640, 640) 29 | SINGLE_CATEGORY_MAX_AREA: 1.0 30 | COLOR_AUG_SSD: True 31 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 32 | FORMAT: "RGB" 33 | TEST: 34 | DETECTIONS_PER_IMAGE: 250 35 | EVAL_PERIOD: 5000 36 | AUG: 37 | ENABLED: False 38 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 39 | MAX_SIZE: 4480 40 | FLIP: True 41 | -------------------------------------------------------------------------------- /configs/ade20k/swin/oneformer_swin_large_bs16_160k_1280x1280.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 
12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22kto1k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | ONE_FORMER: 18 | NUM_OBJECT_QUERIES: 250 19 | INPUT: 20 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1280) for x in range(5, 21)]"] 21 | MIN_SIZE_TRAIN_SAMPLING: "choice" 22 | MIN_SIZE_TEST: 1280 23 | MAX_SIZE_TRAIN: 5120 24 | MAX_SIZE_TEST: 5120 25 | CROP: 26 | ENABLED: True 27 | TYPE: "absolute" 28 | SIZE: (1280, 1280) 29 | SINGLE_CATEGORY_MAX_AREA: 1.0 30 | COLOR_AUG_SSD: True 31 | SIZE_DIVISIBILITY: 1280 # used in dataset mapper 32 | FORMAT: "RGB" 33 | TEST: 34 | DETECTIONS_PER_IMAGE: 250 35 | EVAL_PERIOD: 5000 36 | AUG: 37 | ENABLED: False 38 | MIN_SIZES: [640, 960, 1280, 1600, 1920, 2240] 39 | MAX_SIZE: 8960 40 | FLIP: True 41 | -------------------------------------------------------------------------------- /configs/ade20k/swin/oneformer_swin_large_bs16_160k_896x896.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22kto1k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | ONE_FORMER: 18 | NUM_OBJECT_QUERIES: 250 19 | INPUT: 20 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 896) for x in range(5, 21)]"] 21 | MIN_SIZE_TRAIN_SAMPLING: "choice" 22 | MIN_SIZE_TEST: 896 23 | MAX_SIZE_TRAIN: 3584 24 | MAX_SIZE_TEST: 3584 25 | CROP: 26 | ENABLED: True 27 | TYPE: "absolute" 28 | SIZE: (896, 896) 29 | SINGLE_CATEGORY_MAX_AREA: 1.0 30 | COLOR_AUG_SSD: True 31 | SIZE_DIVISIBILITY: 896 # used in dataset mapper 32 | FORMAT: "RGB" 33 | TEST: 34 | DETECTIONS_PER_IMAGE: 250 35 | EVAL_PERIOD: 5000 36 | AUG: 37 | ENABLED: False 38 | MIN_SIZES: [448, 678, 896, 1120, 1344, 1568] 39 | MAX_SIZE: 6272 40 | FLIP: True 41 | -------------------------------------------------------------------------------- /configs/ade20k/swin/oneformer_swin_tiny_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/cityscapes/Base-Cityscapes-UnifiedSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes 
dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_panoptic_train",) 18 | TEST_PANOPTIC: ("cityscapes_fine_panoptic_val",) 19 | TEST_INSTANCE: ("cityscapes_fine_instance_seg_val",) 20 | TEST_SEMANTIC: ("cityscapes_fine_sem_seg_val",) 21 | SOLVER: 22 | IMS_PER_BATCH: 16 23 | BASE_LR: 0.0001 24 | MAX_ITER: 90000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 0 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | LR_SCHEDULER_NAME: "WarmupPolyLR" 30 | BACKBONE_MULTIPLIER: 0.1 31 | CLIP_GRADIENTS: 32 | ENABLED: True 33 | CLIP_TYPE: "full_model" 34 | CLIP_VALUE: 0.01 35 | NORM_TYPE: 2.0 36 | AMP: 37 | ENABLED: True 38 | INPUT: 39 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 40 | MIN_SIZE_TRAIN_SAMPLING: "choice" 41 | MIN_SIZE_TEST: 1024 42 | MAX_SIZE_TRAIN: 4096 43 | MAX_SIZE_TEST: 2048 44 | CROP: 45 | ENABLED: True 46 | TYPE: "absolute" 47 | SIZE: (512, 1024) 48 | SINGLE_CATEGORY_MAX_AREA: 1.0 49 | COLOR_AUG_SSD: True 50 | SIZE_DIVISIBILITY: -1 51 | FORMAT: "RGB" 52 | DATASET_MAPPER_NAME: "oneformer_unified" 53 | MAX_SEQ_LEN: 77 54 | TASK_SEQ_LEN: 77 55 | TASK_PROB: 56 | SEMANTIC: 0.33 57 | INSTANCE: 0.66 58 | TEST: 59 | EVAL_PERIOD: 5000 60 | AUG: 61 | ENABLED: False 62 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 63 | MAX_SIZE: 4096 64 | FLIP: True 65 | DATALOADER: 66 | FILTER_EMPTY_ANNOTATIONS: True 67 | NUM_WORKERS: 4 68 | VERSION: 2 -------------------------------------------------------------------------------- /configs/cityscapes/convnext/mapillary_pretrain_oneformer_convnext_large_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2ConvNeXt" 5 | CONVNEXT: 6 | IN_CHANNELS: 3 7 | DEPTHS: [3, 3, 27, 3] 8 | DIMS: [192, 384, 768, 1536] 9 | DROP_PATH_RATE: 0.4 10 | LSIT: 1.0 11 | OUT_INDICES: [0, 1, 2, 3] 12 | WEIGHTS: "250_16_convnext_l_oneformer_mapillary_300k.pth" 13 | PIXEL_MEAN: [123.675, 116.280, 103.530] 14 | PIXEL_STD: [58.395, 57.120, 57.375] 15 | ONE_FORMER: 16 | NUM_OBJECT_QUERIES: 250 17 | TEST: 18 | DETECTIONS_PER_IMAGE: 250 19 | -------------------------------------------------------------------------------- /configs/cityscapes/convnext/mapillary_pretrain_oneformer_convnext_xlarge_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2ConvNeXt" 5 | CONVNEXT: 6 | IN_CHANNELS: 3 7 | DEPTHS: [3, 3, 27, 3] 8 | DIMS: [256, 512, 1024, 2048] 9 | DROP_PATH_RATE: 0.4 10 | LSIT: 1.0 11 | OUT_INDICES: [0, 1, 2, 3] 12 | WEIGHTS: "250_16_convnext_xl_oneformer_mapillary_300k.pth" 13 | PIXEL_MEAN: [123.675, 116.280, 103.530] 14 | PIXEL_STD: [58.395, 57.120, 57.375] 15 | ONE_FORMER: 16 | NUM_OBJECT_QUERIES: 250 17 | TEST: 18 | DETECTIONS_PER_IMAGE: 250 19 | -------------------------------------------------------------------------------- /configs/cityscapes/convnext/oneformer_convnext_large_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2ConvNeXt" 5 | CONVNEXT: 6 | IN_CHANNELS: 3 7 | DEPTHS: [3, 3, 27, 3] 8 | DIMS: [192, 384, 768, 1536] 9 | DROP_PATH_RATE: 0.4 10 | LSIT: 1.0 11 | OUT_INDICES: [0, 1, 2, 3] 12 | WEIGHTS: "convnext_large_22k_1k_384.pkl" 13 | PIXEL_MEAN: [123.675, 116.280, 103.530] 14 | PIXEL_STD: [58.395, 57.120, 57.375] 
15 | ONE_FORMER: 16 | NUM_OBJECT_QUERIES: 250 17 | TEST: 18 | DETECTIONS_PER_IMAGE: 250 19 | -------------------------------------------------------------------------------- /configs/cityscapes/convnext/oneformer_convnext_xlarge_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2ConvNeXt" 5 | CONVNEXT: 6 | IN_CHANNELS: 3 7 | DEPTHS: [3, 3, 27, 3] 8 | DIMS: [256, 512, 1024, 2048] 9 | DROP_PATH_RATE: 0.4 10 | LSIT: 1.0 11 | OUT_INDICES: [0, 1, 2, 3] 12 | WEIGHTS: "convnext_xlarge_22k_1k_384_ema.pkl" 13 | PIXEL_MEAN: [123.675, 116.280, 103.530] 14 | PIXEL_STD: [58.395, 57.120, 57.375] 15 | ONE_FORMER: 16 | NUM_OBJECT_QUERIES: 250 17 | TEST: 18 | DETECTIONS_PER_IMAGE: 250 19 | -------------------------------------------------------------------------------- /configs/cityscapes/dinat/oneformer_dinat_large_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2DiNAT" 5 | DiNAT: 6 | EMBED_DIM: 192 7 | MLP_RATIO: 2.0 8 | DEPTHS: [3, 4, 18, 5] 9 | NUM_HEADS: [6, 12, 24, 48] 10 | KERNEL_SIZE: 7 11 | DROP_PATH_RATE: 0.3 12 | DILATIONS: [[1, 18, 1], [1, 5, 1, 9], [1, 2, 1, 3, 1, 4, 1, 2, 1, 3, 1, 4, 1, 2, 1, 3, 1, 4], [1, 2, 1, 2, 1]] 13 | WEIGHTS: "dinat_large_in22k_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | ONE_FORMER: 17 | NUM_OBJECT_QUERIES: 250 18 | SOLVER: 19 | AMP: 20 | ENABLED: False 21 | TEST: 22 | DETECTIONS_PER_IMAGE: 250 -------------------------------------------------------------------------------- /configs/cityscapes/oneformer_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-UnifiedSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "OneFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "OneFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | ONE_FORMER: 19 | TRANSFORMER_DECODER_NAME: "ContrastiveMultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | CONTRASTIVE_WEIGHT: 0.5 27 | CONTRASTIVE_TEMPERATURE: 0.07 28 | HIDDEN_DIM: 256 29 | NUM_OBJECT_QUERIES: 150 30 | USE_TASK_NORM: True 31 | NHEADS: 8 32 | DROPOUT: 0.1 33 | DIM_FEEDFORWARD: 2048 34 | ENC_LAYERS: 0 35 | PRE_NORM: False 36 | ENFORCE_INPUT_PROJ: False 37 | SIZE_DIVISIBILITY: 32 38 | ENC_LAYERS: 0 39 | CLASS_DEC_LAYERS: 2 40 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 41 | TRAIN_NUM_POINTS: 12544 42 | OVERSAMPLE_RATIO: 3.0 43 | IMPORTANCE_SAMPLE_RATIO: 0.75 44 | TEXT_ENCODER: 45 | WIDTH: 256 46 | CONTEXT_LENGTH: 77 47 | NUM_LAYERS: 6 48 | VOCAB_SIZE: 49408 49 | PROJ_NUM_LAYERS: 2 50 | N_CTX: 16 51 | TEST: 52 | SEMANTIC_ON: True 53 | INSTANCE_ON: True 54 | PANOPTIC_ON: True 55 | OVERLAP_THRESHOLD: 0.8 56 | OBJECT_MASK_THRESHOLD: 0.8 57 | TASK: "panoptic" 58 | TEST: 59 | DETECTIONS_PER_IMAGE: 150 60 | 
-------------------------------------------------------------------------------- /configs/cityscapes/swin/oneformer_swin_large_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22kto1k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | ONE_FORMER: 18 | NUM_OBJECT_QUERIES: 250 19 | TEST: 20 | DETECTIONS_PER_IMAGE: 250 21 | -------------------------------------------------------------------------------- /configs/coco/Base-COCO-UnifiedSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train_panoptic_with_sem_seg",) 18 | TEST_PANOPTIC: ("coco_2017_val_panoptic_with_sem_seg",) # to evaluate instance and semantic performance as well 19 | TEST_INSTANCE: ("coco_2017_val_panoptic2instance",) 20 | TEST_SEMANTIC: ("coco_2017_val_panoptic_with_sem_seg",) 21 | SOLVER: 22 | IMS_PER_BATCH: 16 23 | BASE_LR: 0.0001 24 | STEPS: (327778, 355092) 25 | MAX_ITER: 368750 26 | WARMUP_FACTOR: 1.0 27 | WARMUP_ITERS: 10 28 | WEIGHT_DECAY: 0.05 29 | OPTIMIZER: "ADAMW" 30 | BACKBONE_MULTIPLIER: 0.1 31 | CLIP_GRADIENTS: 32 | ENABLED: True 33 | CLIP_TYPE: "full_model" 34 | CLIP_VALUE: 0.01 35 | NORM_TYPE: 2.0 36 | AMP: 37 | ENABLED: True 38 | INPUT: 39 | IMAGE_SIZE: 1024 40 | MIN_SCALE: 0.1 41 | MAX_SCALE: 2.0 42 | FORMAT: "RGB" 43 | DATASET_MAPPER_NAME: "coco_unified_lsj" 44 | MAX_SEQ_LEN: 77 45 | TASK_SEQ_LEN: 77 46 | TASK_PROB: 47 | SEMANTIC: 0.33 48 | INSTANCE: 0.66 49 | TEST: 50 | EVAL_PERIOD: 5000 51 | DATALOADER: 52 | FILTER_EMPTY_ANNOTATIONS: True 53 | NUM_WORKERS: 4 54 | VERSION: 2 55 | -------------------------------------------------------------------------------- /configs/coco/dinat/oneformer_dinat_large_bs16_100ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2DiNAT" 5 | DiNAT: 6 | EMBED_DIM: 192 7 | MLP_RATIO: 2.0 8 | DEPTHS: [3, 4, 18, 5] 9 | NUM_HEADS: [6, 12, 24, 48] 10 | KERNEL_SIZE: 11 11 | DROP_PATH_RATE: 0.3 12 | DILATIONS: [[1, 20, 1], [1, 5, 1, 10], [1, 2, 1, 3, 1, 4, 1, 5, 1, 2, 1, 3, 1, 4, 1, 5, 1, 5], [1, 2, 1, 2, 1]] 13 | WEIGHTS: "dinat_large_in22k_in1k_384_11x11.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | ONE_FORMER: 17 | NUM_OBJECT_QUERIES: 150 18 | SOLVER: 19 | STEPS: (655556, 710184) 20 | MAX_ITER: 737500 21 | AMP: 22 | ENABLED: False 23 | TEST: 24 | DETECTIONS_PER_IMAGE: 150 -------------------------------------------------------------------------------- /configs/coco/oneformer_R50_bs16_50ep.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-UnifiedSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "OneFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "OneFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 133 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | ONE_FORMER: 19 | TRANSFORMER_DECODER_NAME: "ContrastiveMultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | CONTRASTIVE_WEIGHT: 0.5 27 | CONTRASTIVE_TEMPERATURE: 0.07 28 | HIDDEN_DIM: 256 29 | NUM_OBJECT_QUERIES: 150 30 | USE_TASK_NORM: True 31 | NHEADS: 8 32 | DROPOUT: 0.1 33 | DIM_FEEDFORWARD: 2048 34 | ENC_LAYERS: 0 35 | PRE_NORM: False 36 | ENFORCE_INPUT_PROJ: False 37 | SIZE_DIVISIBILITY: 32 38 | CLASS_DEC_LAYERS: 2 39 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 40 | TRAIN_NUM_POINTS: 12544 41 | OVERSAMPLE_RATIO: 3.0 42 | IMPORTANCE_SAMPLE_RATIO: 0.75 43 | TEXT_ENCODER: 44 | WIDTH: 256 45 | CONTEXT_LENGTH: 77 46 | NUM_LAYERS: 6 47 | VOCAB_SIZE: 49408 48 | PROJ_NUM_LAYERS: 2 49 | N_CTX: 16 50 | TEST: 51 | SEMANTIC_ON: True 52 | INSTANCE_ON: True 53 | PANOPTIC_ON: True 54 | DETECTION_ON: False 55 | OVERLAP_THRESHOLD: 0.8 56 | OBJECT_MASK_THRESHOLD: 0.8 57 | TASK: "panoptic" 58 | TEST: 59 | DETECTIONS_PER_IMAGE: 150 60 | -------------------------------------------------------------------------------- /configs/coco/swin/oneformer_swin_large_bs16_100ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22kto1k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | ONE_FORMER: 18 | NUM_OBJECT_QUERIES: 150 19 | SOLVER: 20 | STEPS: (655556, 735184) 21 | MAX_ITER: 737500 22 | AMP: 23 | ENABLED: False 24 | TEST: 25 | DETECTIONS_PER_IMAGE: 150 26 | -------------------------------------------------------------------------------- /configs/coco/swin/oneformer_swin_tiny_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/mapillary_vistas/Base-Mapillary-UnifiedSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: 
"detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_vistas_panoptic_train",) 18 | TEST_PANOPTIC: ("mapillary_vistas_panoptic_val",) 19 | TEST_INSTANCE: ("mapillary_vistas_panoptic_val",) 20 | TEST_SEMANTIC: ("mapillary_vistas_sem_seg_val",) 21 | SOLVER: 22 | IMS_PER_BATCH: 16 23 | BASE_LR: 0.0001 24 | MAX_ITER: 300000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 0 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | LR_SCHEDULER_NAME: "WarmupPolyLR" 30 | BACKBONE_MULTIPLIER: 0.1 31 | CLIP_GRADIENTS: 32 | ENABLED: True 33 | CLIP_TYPE: "full_model" 34 | CLIP_VALUE: 0.01 35 | NORM_TYPE: 2.0 36 | AMP: 37 | ENABLED: True 38 | INPUT: 39 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"] 40 | MIN_SIZE_TRAIN_SAMPLING: "choice" 41 | MIN_SIZE_TEST: 2048 42 | MAX_SIZE_TRAIN: 8192 43 | MAX_SIZE_TEST: 2048 44 | CROP: 45 | ENABLED: True 46 | TYPE: "absolute" 47 | SIZE: (1024, 1024) 48 | SINGLE_CATEGORY_MAX_AREA: 1.0 49 | COLOR_AUG_SSD: True 50 | SIZE_DIVISIBILITY: 1024 # used in dataset mapper 51 | FORMAT: "RGB" 52 | DATASET_MAPPER_NAME: "oneformer_unified" 53 | MAX_SEQ_LEN: 77 54 | TASK_SEQ_LEN: 77 55 | TASK_PROB: 56 | SEMANTIC: 0.50 57 | INSTANCE: 0.50 58 | TEST: 59 | EVAL_PERIOD: 30000 60 | AUG: 61 | ENABLED: False 62 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 63 | MAX_SIZE: 4096 64 | FLIP: True 65 | DATALOADER: 66 | FILTER_EMPTY_ANNOTATIONS: True 67 | NUM_WORKERS: 10 68 | VERSION: 2 -------------------------------------------------------------------------------- /configs/mapillary_vistas/convnext/cityscapes_pretrain_oneformer_convnext_large_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2ConvNeXt" 5 | CONVNEXT: 6 | IN_CHANNELS: 3 7 | DEPTHS: [3, 3, 27, 3] 8 | DIMS: [192, 384, 768, 1536] 9 | DROP_PATH_RATE: 0.4 10 | LSIT: 1.0 11 | OUT_INDICES: [0, 1, 2, 3] 12 | WEIGHTS: "convnext_large_22k_1k_384.pkl" 13 | PIXEL_MEAN: [123.675, 116.280, 103.530] 14 | PIXEL_STD: [58.395, 57.120, 57.375] 15 | ONE_FORMER: 16 | NUM_OBJECT_QUERIES: 250 17 | INPUT: 18 | TASK_PROB: 19 | SEMANTIC: 0.33 20 | INSTANCE: 0.66 21 | TEST: 22 | DETECTIONS_PER_IMAGE: 250 23 | -------------------------------------------------------------------------------- /configs/mapillary_vistas/convnext/cityscapes_pretrain_oneformer_convnext_xlarge_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2ConvNeXt" 5 | CONVNEXT: 6 | IN_CHANNELS: 3 7 | DEPTHS: [3, 3, 27, 3] 8 | DIMS: [256, 512, 1024, 2048] 9 | DROP_PATH_RATE: 0.4 10 | LSIT: 1.0 11 | OUT_INDICES: [0, 1, 2, 3] 12 | WEIGHTS: "convnext_xlarge_22k_1k_384_ema.pkl" 13 | PIXEL_MEAN: [123.675, 116.280, 103.530] 14 | PIXEL_STD: [58.395, 57.120, 57.375] 15 | ONE_FORMER: 16 | NUM_OBJECT_QUERIES: 250 17 | INPUT: 18 | TASK_PROB: 19 | SEMANTIC: 0.33 20 | INSTANCE: 0.66 21 | TEST: 22 | DETECTIONS_PER_IMAGE: 250 23 | -------------------------------------------------------------------------------- 
/configs/mapillary_vistas/convnext/oneformer_convnext_large_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2ConvNeXt" 5 | CONVNEXT: 6 | IN_CHANNELS: 3 7 | DEPTHS: [3, 3, 27, 3] 8 | DIMS: [192, 384, 768, 1536] 9 | DROP_PATH_RATE: 0.4 10 | LSIT: 1.0 11 | OUT_INDICES: [0, 1, 2, 3] 12 | WEIGHTS: "convnext_large_22k_1k_384.pkl" 13 | PIXEL_MEAN: [123.675, 116.280, 103.530] 14 | PIXEL_STD: [58.395, 57.120, 57.375] 15 | ONE_FORMER: 16 | NUM_OBJECT_QUERIES: 250 17 | TEST: 18 | DETECTIONS_PER_IMAGE: 250 19 | -------------------------------------------------------------------------------- /configs/mapillary_vistas/dinat/oneformer_dinat_large_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2DiNAT" 5 | DiNAT: 6 | EMBED_DIM: 192 7 | MLP_RATIO: 2.0 8 | DEPTHS: [3, 4, 18, 5] 9 | NUM_HEADS: [6, 12, 24, 48] 10 | KERNEL_SIZE: 11 11 | DROP_PATH_RATE: 0.3 12 | DILATIONS: [[1, 20, 1], [1, 5, 1, 10], [1, 2, 1, 3, 1, 4, 1, 5, 1, 2, 1, 3, 1, 4, 1, 5, 1, 5], [1, 2, 1, 2, 1]] 13 | WEIGHTS: "dinat_large_in22k_in1k_384_11x11.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | ONE_FORMER: 17 | NUM_OBJECT_QUERIES: 250 18 | SOLVER: 19 | AMP: 20 | ENABLED: False 21 | TEST: 22 | DETECTIONS_PER_IMAGE: 250 -------------------------------------------------------------------------------- /configs/mapillary_vistas/oneformer_R50_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Mapillary-UnifiedSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "OneFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "OneFormerHead" 6 | IGNORE_VALUE: 65 7 | NUM_CLASSES: 65 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | ONE_FORMER: 19 | TRANSFORMER_DECODER_NAME: "ContrastiveMultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | CONTRASTIVE_WEIGHT: 0.5 27 | CONTRASTIVE_TEMPERATURE: 0.07 28 | HIDDEN_DIM: 256 29 | NUM_OBJECT_QUERIES: 150 30 | USE_TASK_NORM: True 31 | NHEADS: 8 32 | DROPOUT: 0.1 33 | DIM_FEEDFORWARD: 2048 34 | ENC_LAYERS: 0 35 | PRE_NORM: False 36 | ENFORCE_INPUT_PROJ: False 37 | SIZE_DIVISIBILITY: 32 38 | ENC_LAYERS: 0 39 | CLASS_DEC_LAYERS: 2 40 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 41 | TRAIN_NUM_POINTS: 12544 42 | OVERSAMPLE_RATIO: 3.0 43 | IMPORTANCE_SAMPLE_RATIO: 0.75 44 | TEXT_ENCODER: 45 | WIDTH: 256 46 | CONTEXT_LENGTH: 77 47 | NUM_LAYERS: 6 48 | VOCAB_SIZE: 49408 49 | PROJ_NUM_LAYERS: 2 50 | N_CTX: 16 51 | TEST: 52 | SEMANTIC_ON: True 53 | INSTANCE_ON: True 54 | PANOPTIC_ON: True 55 | OVERLAP_THRESHOLD: 0.8 56 | OBJECT_MASK_THRESHOLD: 0.8 57 | TASK: "panoptic" 58 | TEST: 59 | DETECTIONS_PER_IMAGE: 150 60 | -------------------------------------------------------------------------------- /configs/mapillary_vistas/swin/oneformer_swin_large_bs16_300k.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22kto1k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | ONE_FORMER: 18 | NUM_OBJECT_QUERIES: 250 19 | TEST: 20 | DETECTIONS_PER_IMAGE: 250 21 | -------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | # Prepare Datasets for OneFormer 2 | 3 | - A dataset can be used by accessing [DatasetCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.DatasetCatalog) for its data, or [MetadataCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.MetadataCatalog) for its metadata (class names, etc). 4 | - This document explains how to setup the builtin datasets so they can be used by the above APIs. [Training OneFormer with Custom Datasets](https://github.com/SHI-Labs/OneFormer/tree/main/datasets/custom_datasets) gives a deeper dive on how to train OneFormer with custom datasets. 5 | - Detectron2 has builtin support for a few datasets. The datasets are assumed to exist in a directory specified by the environment variable `DETECTRON2_DATASETS`. Under this directory, detectron2 will look for datasets in the structure described below, if needed. 6 | 7 | ```text 8 | $DETECTRON2_DATASETS/ 9 | ADEChallengeData2016/ 10 | cityscapes/ 11 | coco/ 12 | mapillary_vistas/ 13 | ``` 14 | 15 | - You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`. If left unset, the default is `./datasets` relative to your current working directory. 16 | 17 | 18 | ## Expected dataset structure for [ADE20K](http://sceneparsing.csail.mit.edu/) 19 | 20 | ```text 21 | ADEChallengeData2016/ 22 | images/ 23 | annotations/ 24 | objectInfo150.txt 25 | # download instance annotation 26 | annotations_instance/ 27 | # generated by prepare_ade20k_sem_seg.py 28 | annotations_detectron2/ 29 | # below are generated by prepare_ade20k_pan_seg.py 30 | ade20k_panoptic_{train,val}.json 31 | ade20k_panoptic_{train,val}/ 32 | # below are generated by prepare_ade20k_ins_seg.py 33 | ade20k_instance_{train,val}.json 34 | ``` 35 | 36 | - Generate `annotations_detectron2`: 37 | 38 | ```bash 39 | python datasets/prepare_ade20k_sem_seg.py 40 | ``` 41 | 42 | - Install panopticapi by: 43 | 44 | ```bash 45 | pip install git+https://github.com/cocodataset/panopticapi.git 46 | ``` 47 | 48 | - Download the instance annotation from : 49 | 50 | ```bash 51 | wget http://sceneparsing.csail.mit.edu/data/ChallengeData2017/annotations_instance.tar 52 | ``` 53 | 54 | - Then, run `python datasets/prepare_ade20k_pan_seg.py`, to combine semantic and instance annotations for panoptic annotations. 55 | 56 | - Run `python datasets/prepare_ade20k_ins_seg.py`, to extract instance annotations in COCO format. 57 | 58 | ## Expected dataset structure for [Cityscapes](https://www.cityscapes-dataset.com/downloads/) 59 | 60 | ```text 61 | cityscapes/ 62 | gtFine/ 63 | train/ 64 | aachen/ 65 | color.png, instanceIds.png, labelIds.png, polygons.json, 66 | labelTrainIds.png 67 | ... 
68 | val/ 69 | test/ 70 | # below are the generated Cityscapes panoptic annotations 71 | cityscapes_panoptic_train.json 72 | cityscapes_panoptic_train/ 73 | cityscapes_panoptic_val.json 74 | cityscapes_panoptic_val/ 75 | cityscapes_panoptic_test.json 76 | cityscapes_panoptic_test/ 77 | leftImg8bit/ 78 | train/ 79 | val/ 80 | test/ 81 | ``` 82 | 83 | - Log in and download the dataset: 84 | 85 | ```bash 86 | wget --keep-session-cookies --save-cookies=cookies.txt --post-data 'username=myusername&password=mypassword&submit=Login' https://www.cityscapes-dataset.com/login/ 87 | ######## gtFine 88 | wget --load-cookies cookies.txt --content-disposition https://www.cityscapes-dataset.com/file-handling/?packageID=1 89 | ######## leftImg8bit 90 | wget --load-cookies cookies.txt --content-disposition https://www.cityscapes-dataset.com/file-handling/?packageID=3 91 | ``` 92 | 93 | - Install the cityscapesScripts package by: 94 | 95 | ```bash 96 | pip install git+https://github.com/mcordts/cityscapesScripts.git 97 | ``` 98 | 99 | - To create labelTrainIds.png, first prepare the above structure, then run the cityscapesScripts preparation script with: 100 | 101 | ```bash 102 | git clone https://github.com/mcordts/cityscapesScripts.git 103 | ``` 104 | 105 | ```bash 106 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesScripts/cityscapesscripts/preparation/createTrainIdLabelImgs.py 107 | ``` 108 | 109 | These files are not needed for instance segmentation. 110 | 111 | - To generate the Cityscapes panoptic dataset, run the cityscapesScripts preparation script with: 112 | 113 | ```bash 114 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesScripts/cityscapesscripts/preparation/createPanopticImgs.py 115 | ``` 116 | 117 | These files are not needed for semantic and instance segmentation. 118 | 119 | ## Expected dataset structure for [COCO](https://cocodataset.org/#download) 120 | 121 | ```text 122 | coco/ 123 | annotations/ 124 | instances_{train,val}2017.json 125 | panoptic_{train,val}2017.json 126 | caption_{train,val}2017.json 127 | # evaluate on instance labels derived from panoptic annotations 128 | panoptic2instances_val2017.json 129 | {train,val}2017/ 130 | # image files that are mentioned in the corresponding json 131 | panoptic_{train,val}2017/ # png annotations 132 | panoptic_semseg_{train,val}2017/ # generated by the script mentioned below 133 | ``` 134 | 135 | - Install panopticapi by: 136 | 137 | ```bash 138 | pip install git+https://github.com/cocodataset/panopticapi.git 139 | ``` 140 | 141 | - Then run `python datasets/prepare_coco_semantic_annos_from_panoptic_annos.py` to extract semantic annotations from panoptic annotations (only used for evaluation). 142 | 143 | - Then run the following command to convert the panoptic json into instance json format (used for evaluation on the instance segmentation task): 144 | 145 | ```bash 146 | python datasets/panoptic2detection_coco_format.py --things_only 147 | ``` 148 | 149 | ## Expected dataset structure for [Mapillary Vistas](https://www.mapillary.com/dataset/vistas) 150 | 151 | ```text 152 | mapillary_vistas/ 153 | training/ 154 | images/ 155 | instances/ 156 | labels/ 157 | panoptic/ 158 | validation/ 159 | images/ 160 | instances/ 161 | labels/ 162 | panoptic/ 163 | mapillary_vistas_instance_{train,val}.json # generated by the script mentioned below 164 | ``` 165 | 166 | No preprocessing is needed for Mapillary Vistas on semantic and panoptic segmentation. 167 | 168 | We do not evaluate for the instance segmentation task on the Mapillary Vistas dataset.
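Once `DETECTRON2_DATASETS` points at the structure above and the preparation scripts have been run, a quick way to confirm that the splits referenced in the configs (e.g. `ade20k_panoptic_val`, `cityscapes_fine_panoptic_val`) are visible is to query Detectron2's catalogs. The sketch below is illustrative and not part of the repository: it assumes that importing the `oneformer` package registers the custom splits as a side effect, and the dataset path is a placeholder.

```python
# Hedged sanity check (not part of the repository).
# Assumption: importing `oneformer` registers the ADE20K/Cityscapes/COCO/Mapillary
# splits referenced by the configs (e.g. "ade20k_panoptic_val").
import os

# Point detectron2 at the dataset root before the registrations run.
os.environ.setdefault("DETECTRON2_DATASETS", "/path/to/datasets")  # placeholder path

from detectron2.data import DatasetCatalog, MetadataCatalog
import oneformer  # noqa: F401  (side effect: dataset registration, assumed)

# List every registered split that belongs to the datasets prepared above.
for name in sorted(DatasetCatalog.list()):
    if any(key in name for key in ("ade20k", "cityscapes", "coco", "mapillary")):
        print(name)

# Per-split metadata (class names, ignore label, ...) is attached once registered.
print(MetadataCatalog.get("ade20k_panoptic_val"))
```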
169 | -------------------------------------------------------------------------------- /datasets/ade20k_instance_catid_mapping.txt: -------------------------------------------------------------------------------- 1 | Instacne100 SceneParse150 FullADE20K 2 | 1 8 165 3 | 2 9 3055 4 | 3 11 350 5 | 4 13 1831 6 | 5 15 774 7 | 5 15 783 8 | 6 16 2684 9 | 7 19 687 10 | 8 20 471 11 | 9 21 401 12 | 10 23 1735 13 | 11 24 2473 14 | 12 25 2329 15 | 13 28 1564 16 | 14 31 57 17 | 15 32 2272 18 | 16 33 907 19 | 17 34 724 20 | 18 36 2985 21 | 18 36 533 22 | 19 37 1395 23 | 20 38 155 24 | 21 39 2053 25 | 22 40 689 26 | 23 42 266 27 | 24 43 581 28 | 25 44 2380 29 | 26 45 491 30 | 27 46 627 31 | 28 48 2388 32 | 29 50 943 33 | 30 51 2096 34 | 31 54 2530 35 | 32 56 420 36 | 33 57 1948 37 | 34 58 1869 38 | 35 59 2251 39 | 36 63 239 40 | 37 65 571 41 | 38 66 2793 42 | 39 67 978 43 | 40 68 236 44 | 41 70 181 45 | 42 71 629 46 | 43 72 2598 47 | 44 73 1744 48 | 45 74 1374 49 | 46 75 591 50 | 47 76 2679 51 | 48 77 223 52 | 49 79 47 53 | 50 81 327 54 | 51 82 2821 55 | 52 83 1451 56 | 53 84 2880 57 | 54 86 480 58 | 55 87 77 59 | 56 88 2616 60 | 57 89 246 61 | 57 89 247 62 | 58 90 2733 63 | 59 91 14 64 | 60 93 38 65 | 61 94 1936 66 | 62 96 120 67 | 63 98 1702 68 | 64 99 249 69 | 65 103 2928 70 | 66 104 2337 71 | 67 105 1023 72 | 68 108 2989 73 | 69 109 1930 74 | 70 111 2586 75 | 71 112 131 76 | 72 113 146 77 | 73 116 95 78 | 74 117 1563 79 | 75 119 1708 80 | 76 120 103 81 | 77 121 1002 82 | 78 122 2569 83 | 79 124 2833 84 | 80 125 1551 85 | 81 126 1981 86 | 82 127 29 87 | 83 128 187 88 | 84 130 747 89 | 85 131 2254 90 | 86 133 2262 91 | 87 134 1260 92 | 88 135 2243 93 | 89 136 2932 94 | 90 137 2836 95 | 91 138 2850 96 | 92 139 64 97 | 93 140 894 98 | 94 143 1919 99 | 95 144 1583 100 | 96 145 318 101 | 97 147 2046 102 | 98 148 1098 103 | 99 149 530 104 | 100 150 954 105 | -------------------------------------------------------------------------------- /datasets/custom_datasets/README.md: -------------------------------------------------------------------------------- 1 | # Training OneFormer with Custom Datasets 2 | 3 | OneFormer advocates the usage of panoptic annotations along with its task-conditioned joint training strategy. However, if panoptic annotations are not available, then also OneFormer can be trained using only the instance or semantic annotations on custom datasets. We provide some guidelines for training with custom datasets. 4 | 5 | ## Register your New Dataset 6 | 7 | - OneFormer uses the information (class names, thing classes, etc.) stored in a dataset's metadata while preparing a dataset dictionary using a [`dataset_mapper`](https://github.com/SHI-Labs/OneFormer/tree/main/oneformer/data/dataset_mappers). 8 | 9 | - [Use Custom Datasets](https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html) gives a deeper dive into registering a new custom dataset. 10 | 11 | ## Training with Available Panoptic Annotations 12 | 13 | - To prepare the dataset dictionary for each iteration during training, OneFormer uses a [`dataset_mapper`](https://github.com/SHI-Labs/OneFormer/tree/main/oneformer/data/dataset_mappers) class. 
14 | 15 | - Out of the box, we provide two `dataset_mapper` classes that support task-conditioned joint training using panoptic annotations: 16 | - [`COCOUnifiedNewBaselineDatasetMapper`](https://github.com/SHI-Labs/OneFormer/blob/5e04c9aaffd9bc73020d2238757f62346fe778c0/oneformer/data/dataset_mappers/coco_unified_new_baseline_dataset_mapper.py#L56): Specifically designed for the COCO annotation format. 17 | - [`OneFormerUnifiedDatasetMapper`](https://github.com/SHI-Labs/OneFormer/blob/5e04c9aaffd9bc73020d2238757f62346fe778c0/oneformer/data/dataset_mappers/oneformer_unified_dataset_mapper.py#L26): General annotation format. 18 | 19 | - If you have panoptic annotations for your custom dataset, you may use these dataset_mapper classes directly after registering your dataset. You may also tune the [task sampling probabilities in the corresponding config file](https://github.com/SHI-Labs/OneFormer/blob/5e04c9aaffd9bc73020d2238757f62346fe778c0/configs/ade20k/Base-ADE20K-UnifiedSegmentation.yaml#L55). 20 | 21 | - If you want to train using only instance or semantic annotations, please follow the next section on preparing a custom dataset mapper class. 22 | 23 | ## Write a Custom Dataset Mapper Class 24 | 25 | - If you want to train using only instance or semantic annotations, write your custom dataset mapper class and add it to the [`build_train_loader`](https://github.com/SHI-Labs/OneFormer/blob/5e04c9aaffd9bc73020d2238757f62346fe778c0/train_net.py#L156) method. 26 | 27 | - We provide a few templates for custom dataset mappers: 28 | - [`InstanceCOCOCustomNewBaselineDatasetMapper`](https://github.com/SHI-Labs/OneFormer/blob/a7fae86ce5791a93132c059c1bdfc79c9f842820/datasets/custom_datasets/instance_coco_custom_dataset_mapper.py#L72): Specifically designed for the COCO instance annotation format. 29 | - [`InstanceOneFormerCustomDatasetMapper`](https://github.com/SHI-Labs/OneFormer/blob/a7fae86ce5791a93132c059c1bdfc79c9f842820/datasets/custom_datasets/instance_oneformer_custom_dataset_mapper.py#L26): General instance annotation format. 30 | - [`SemanticOneFormerCustomDatasetMapper`](https://github.com/SHI-Labs/OneFormer/blob/a7fae86ce5791a93132c059c1bdfc79c9f842820/datasets/custom_datasets/semantic_oneformer_custom_dataset_mapper.py#L26): General semantic annotation format. 31 | 32 | - Remember to register your custom dataset before training; a minimal registration and loader sketch follows below. 33 | 34 | 35 | Now you are all set to train OneFormer using your custom dataset!
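
To make the two steps above concrete, here is a minimal sketch of registering a custom split and routing one of the template mappers through `build_train_loader`. The dataset name (`my_dataset_train`), class names, and the hard-coded record are hypothetical placeholders, and the import path for the template mapper depends on where you place the file; the detectron2 calls (`DatasetCatalog.register`, `MetadataCatalog.get(...).set`, `build_detection_train_loader`) follow their standard usage.

```python
from detectron2.data import DatasetCatalog, MetadataCatalog, build_detection_train_loader

# Template mapper listed above; adjust the import to match where you copy the file,
# since datasets/custom_datasets/ is not an installed package.
from instance_coco_custom_dataset_mapper import InstanceCOCOCustomNewBaselineDatasetMapper


def load_my_dataset_dicts():
    # Return a list[dict] in detectron2's standard dataset format;
    # a single hard-coded record is shown purely for illustration.
    return [
        {
            "file_name": "images/0001.jpg",
            "image_id": 0,
            "height": 480,
            "width": 640,
            "annotations": [],  # fill with your instance (or semantic) annotations
        }
    ]


# 1. Register the custom split and its metadata before training starts.
DatasetCatalog.register("my_dataset_train", load_my_dataset_dicts)
MetadataCatalog.get("my_dataset_train").set(
    thing_classes=["cat", "dog"],  # hypothetical class names
    ignore_label=255,
)


# 2. Route the custom mapper through the trainer, e.g. inside
#    Trainer.build_train_loader in train_net.py.
def build_train_loader(cfg):
    mapper = InstanceCOCOCustomNewBaselineDatasetMapper(cfg, True)
    return build_detection_train_loader(cfg, mapper=mapper)
```

Point `DATASETS.TRAIN` in your config at the registered name (`("my_dataset_train",)` here) so the loader picks up the new split.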
-------------------------------------------------------------------------------- /datasets/fg_ids.py: -------------------------------------------------------------------------------- 1 | ADE20K_FG_IDS = { 2 | 1: 8, 3 | 2: 9, 4 | 3: 11, 5 | 4: 13, 6 | 5: 15, 7 | 5: 15, 8 | 6: 16, 9 | 7: 19, 10 | 8: 20, 11 | 9: 21, 12 | 10: 23, 13 | 11: 24, 14 | 12: 25, 15 | 13: 28, 16 | 14: 31, 17 | 15: 32, 18 | 16: 33, 19 | 17: 34, 20 | 18: 36, 21 | 18: 36, 22 | 19: 37, 23 | 20: 38, 24 | 21: 39, 25 | 22: 40, 26 | 23: 42, 27 | 24: 43, 28 | 25: 44, 29 | 26: 45, 30 | 27: 46, 31 | 28: 48, 32 | 29: 50, 33 | 30: 51, 34 | 31: 54, 35 | 32: 56, 36 | 33: 57, 37 | 34: 58, 38 | 35: 59, 39 | 36: 63, 40 | 37: 65, 41 | 38: 66, 42 | 39: 67, 43 | 40: 68, 44 | 41: 70, 45 | 42: 71, 46 | 43: 72, 47 | 44: 73, 48 | 45: 74, 49 | 46: 75, 50 | 47: 76, 51 | 48: 77, 52 | 49: 79, 53 | 50: 81, 54 | 51: 82, 55 | 52: 83, 56 | 53: 84, 57 | 54: 86, 58 | 55: 87, 59 | 56: 88, 60 | 57: 89, 61 | 57: 89, 62 | 58: 90, 63 | 59: 91, 64 | 60: 93, 65 | 61: 94, 66 | 62: 96, 67 | 63: 98, 68 | 64: 99, 69 | 65: 103, 70 | 66: 104, 71 | 67: 105, 72 | 68: 108, 73 | 69: 109, 74 | 70: 111, 75 | 71: 112, 76 | 72: 113, 77 | 73: 116, 78 | 74: 117, 79 | 75: 119, 80 | 76: 120, 81 | 77: 121, 82 | 78: 122, 83 | 79: 124, 84 | 80: 125, 85 | 81: 126, 86 | 82: 127, 87 | 83: 128, 88 | 84: 130, 89 | 85: 131, 90 | 86: 133, 91 | 87: 134, 92 | 88: 135, 93 | 89: 136, 94 | 90: 137, 95 | 91: 138, 96 | 92: 139, 97 | 93: 140, 98 | 94: 143, 99 | 95: 144, 100 | 96: 145, 101 | 97: 147, 102 | 98: 148, 103 | 99: 149, 104 | 100: 150 105 | } 106 | 107 | 108 | CITYSCAPES_FG_NAMES = ['person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', 'bicycle'] -------------------------------------------------------------------------------- /datasets/panoptic2detection_coco_format.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # ------------------------------------------------------------------------------ 3 | # Reference: https://github.com/cocodataset/panopticapi/blob/master/converters/panoptic2detection_coco_format.py 4 | # Modified by Jitesh Jain (https://github.com/praeclarumjj3) 5 | # ------------------------------------------------------------------------------ 6 | ''' 7 | This script converts panoptic COCO format to detection COCO format. More 8 | information about the formats can be found here: 9 | http://cocodataset.org/#format-data. All segments will be stored in RLE format. 10 | 11 | Additional option: 12 | - using option '--things_only' the script can discard all stuff 13 | segments, saving segments of things classes only. 
14 | ''' 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | from __future__ import unicode_literals 19 | import os, sys 20 | import argparse 21 | import numpy as np 22 | import json 23 | import time 24 | import multiprocessing 25 | 26 | import PIL.Image as Image 27 | 28 | from panopticapi.utils import get_traceback, rgb2id, save_json 29 | 30 | try: 31 | # set up path for pycocotools 32 | # sys.path.append('./cocoapi-master/PythonAPI/') 33 | from pycocotools import mask as COCOmask 34 | except Exception: 35 | raise Exception("Please install pycocotools module from https://github.com/cocodataset/cocoapi") 36 | 37 | @get_traceback 38 | def convert_panoptic_to_detection_coco_format_single_core( 39 | proc_id, annotations_set, categories, segmentations_folder, things_only 40 | ): 41 | annotations_detection = [] 42 | for working_idx, annotation in enumerate(annotations_set): 43 | if working_idx % 100 == 0: 44 | print('Core: {}, {} from {} images processed'.format(proc_id, 45 | working_idx, 46 | len(annotations_set))) 47 | 48 | file_name = '{}.png'.format(annotation['file_name'].rsplit('.')[0]) 49 | try: 50 | pan_format = np.array( 51 | Image.open(os.path.join(segmentations_folder, file_name)), dtype=np.uint32 52 | ) 53 | except IOError: 54 | raise KeyError('no prediction png file for id: {}'.format(annotation['image_id'])) 55 | pan = rgb2id(pan_format) 56 | 57 | for segm_info in annotation['segments_info']: 58 | if things_only and categories[segm_info['category_id']]['isthing'] != 1: 59 | continue 60 | mask = (pan == segm_info['id']).astype(np.uint8) 61 | mask = np.expand_dims(mask, axis=2) 62 | segm_info.pop('id') 63 | segm_info['image_id'] = annotation['image_id'] 64 | rle = COCOmask.encode(np.asfortranarray(mask))[0] 65 | rle['counts'] = rle['counts'].decode('utf8') 66 | segm_info['segmentation'] = rle 67 | annotations_detection.append(segm_info) 68 | 69 | print('Core: {}, all {} images processed'.format(proc_id, len(annotations_set))) 70 | return annotations_detection 71 | 72 | 73 | def convert_panoptic_to_detection_coco_format(input_json_file, 74 | segmentations_folder, 75 | output_json_file, 76 | categories_json_file, 77 | things_only): 78 | start_time = time.time() 79 | 80 | if segmentations_folder is None: 81 | segmentations_folder = input_json_file.rsplit('.', 1)[0] 82 | 83 | print("CONVERTING...") 84 | print("COCO panoptic format:") 85 | print("\tSegmentation folder: {}".format(segmentations_folder)) 86 | print("\tJSON file: {}".format(input_json_file)) 87 | print("TO") 88 | print("COCO detection format") 89 | print("\tJSON file: {}".format(output_json_file)) 90 | if things_only: 91 | print("Saving only segments of things classes.") 92 | print('\n') 93 | 94 | print("Reading annotation information from {}".format(input_json_file)) 95 | with open(input_json_file, 'r') as f: 96 | d_coco = json.load(f) 97 | annotations_panoptic = d_coco['annotations'] 98 | 99 | with open(categories_json_file, 'r') as f: 100 | categories_list = json.load(f) 101 | categories = {category['id']: category for category in categories_list} 102 | 103 | cpu_num = multiprocessing.cpu_count() 104 | annotations_split = np.array_split(annotations_panoptic, cpu_num) 105 | print("Number of cores: {}, images per core: {}".format(cpu_num, len(annotations_split[0]))) 106 | workers = multiprocessing.Pool(processes=cpu_num) 107 | processes = [] 108 | for proc_id, annotations_set in enumerate(annotations_split): 109 | p = 
workers.apply_async(convert_panoptic_to_detection_coco_format_single_core, 110 | (proc_id, annotations_set, categories, segmentations_folder, things_only)) 111 | processes.append(p) 112 | annotations_coco_detection = [] 113 | for p in processes: 114 | annotations_coco_detection.extend(p.get()) 115 | for idx, ann in enumerate(annotations_coco_detection): 116 | ann['id'] = idx 117 | 118 | d_coco['annotations'] = annotations_coco_detection 119 | categories_coco_detection = [] 120 | for category in d_coco['categories']: 121 | if things_only and category['isthing'] != 1: 122 | continue 123 | category.pop('isthing') 124 | categories_coco_detection.append(category) 125 | d_coco['categories'] = categories_coco_detection 126 | save_json(d_coco, output_json_file) 127 | 128 | t_delta = time.time() - start_time 129 | print("Time elapsed: {:0.2f} seconds".format(t_delta)) 130 | 131 | 132 | if __name__ == "__main__": 133 | parser = argparse.ArgumentParser( 134 | description="The script converts panoptic COCO format to detection \ 135 | COCO format. See this file's head for more information." 136 | ) 137 | parser.add_argument('--things_only', action='store_true', 138 | help="discard stuff classes") 139 | args = parser.parse_args() 140 | 141 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 142 | root = os.path.join(_root, "coco") 143 | input_json_file = os.path.join(root, "annotations", "panoptic_val2017.json") 144 | output_json_file = os.path.join(root, "annotations", "panoptic2instances_val2017.json") 145 | categories_json_file = "datasets/panoptic_coco_categories.json" 146 | segmentations_folder = os.path.join(root, "panoptic_val2017") 147 | 148 | convert_panoptic_to_detection_coco_format(input_json_file, 149 | segmentations_folder, 150 | output_json_file, 151 | categories_json_file, 152 | args.things_only) 153 | -------------------------------------------------------------------------------- /datasets/prepare_ade20k_ins_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | import glob 5 | import json 6 | import os 7 | from collections import Counter 8 | 9 | import numpy as np 10 | import tqdm 11 | from panopticapi.utils import IdGenerator, save_json 12 | from PIL import Image 13 | import pycocotools.mask as mask_util 14 | 15 | 16 | if __name__ == "__main__": 17 | dataset_dir = os.getenv("DETECTRON2_DATASETS", "datasets") 18 | 19 | for name, dirname in [("train", "training"), ("val", "validation")]: 20 | image_dir = os.path.join(dataset_dir, f"ADEChallengeData2016/images/{dirname}/") 21 | instance_dir = os.path.join( 22 | dataset_dir, f"ADEChallengeData2016/annotations_instance/{dirname}/" 23 | ) 24 | 25 | # img_id = 0 26 | ann_id = 1 27 | 28 | # json 29 | out_file = os.path.join(dataset_dir, f"ADEChallengeData2016/ade20k_instance_{name}.json") 30 | 31 | # json config 32 | instance_config_file = "datasets/ade20k_instance_imgCatIds.json" 33 | with open(instance_config_file) as f: 34 | category_dict = json.load(f)["categories"] 35 | 36 | # load catid mapping 37 | # it is important to share category id for both instance and panoptic annotations 38 | mapping_file = "datasets/ade20k_instance_catid_mapping.txt" 39 | with open(mapping_file) as f: 40 | map_id = {} 41 | for i, line in enumerate(f.readlines()): 42 | if i == 0: 43 | continue 44 | ins_id, sem_id, _ = line.strip().split() 45 | # shift id by 1 because we want it to start from 0! 
46 | # ignore_label becomes 255 47 | map_id[int(ins_id)] = int(sem_id) - 1 48 | 49 | for cat in category_dict: 50 | cat["id"] = map_id[cat["id"]] 51 | 52 | filenames = sorted(glob.glob(os.path.join(image_dir, "*.jpg"))) 53 | 54 | ann_dict = {} 55 | images = [] 56 | annotations = [] 57 | 58 | for idx, filename in enumerate(tqdm.tqdm(filenames)): 59 | image = {} 60 | image_id = os.path.basename(filename).split(".")[0] 61 | 62 | image["id"] = image_id 63 | image["file_name"] = os.path.basename(filename) 64 | 65 | original_format = np.array(Image.open(filename)) 66 | image["width"] = original_format.shape[1] 67 | image["height"] = original_format.shape[0] 68 | 69 | images.append(image) 70 | 71 | filename_instance = os.path.join(instance_dir, image_id + ".png") 72 | ins_seg = np.asarray(Image.open(filename_instance)) 73 | assert ins_seg.dtype == np.uint8 74 | 75 | instance_cat_ids = ins_seg[..., 0] 76 | # instance id starts from 1! 77 | # because 0 is reserved as VOID label 78 | instance_ins_ids = ins_seg[..., 1] 79 | 80 | # process things 81 | for thing_id in np.unique(instance_ins_ids): 82 | if thing_id == 0: 83 | continue 84 | mask = instance_ins_ids == thing_id 85 | instance_cat_id = np.unique(instance_cat_ids[mask]) 86 | assert len(instance_cat_id) == 1 87 | 88 | anno = {} 89 | anno['id'] = ann_id 90 | ann_id += 1 91 | anno['image_id'] = image['id'] 92 | anno["iscrowd"] = int(0) 93 | anno["category_id"] = int(map_id[instance_cat_id[0]]) 94 | 95 | inds = np.nonzero(mask) 96 | ymin, ymax = inds[0].min(), inds[0].max() 97 | xmin, xmax = inds[1].min(), inds[1].max() 98 | anno["bbox"] = [int(xmin), int(ymin), int(xmax - xmin + 1), int(ymax - ymin + 1)] 99 | # if xmax <= xmin or ymax <= ymin: 100 | # continue 101 | rle = mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0] 102 | rle["counts"] = rle["counts"].decode("utf-8") 103 | anno["segmentation"] = rle 104 | anno["area"] = int(mask_util.area(rle)) 105 | annotations.append(anno) 106 | 107 | # save this 108 | ann_dict['images'] = images 109 | ann_dict['categories'] = category_dict 110 | ann_dict['annotations'] = annotations 111 | 112 | save_json(ann_dict, out_file) 113 | -------------------------------------------------------------------------------- /datasets/prepare_ade20k_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | import os 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import tqdm 9 | from PIL import Image 10 | 11 | 12 | def convert(input, output): 13 | img = np.asarray(Image.open(input)) 14 | assert img.dtype == np.uint8 15 | img = img - 1 # 0 (ignore) becomes 255. 
others are shifted by 1 16 | Image.fromarray(img).save(output) 17 | 18 | 19 | if __name__ == "__main__": 20 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016" 21 | for name in ["training", "validation"]: 22 | annotation_dir = dataset_dir / "annotations" / name 23 | output_dir = dataset_dir / "annotations_detectron2" / name 24 | output_dir.mkdir(parents=True, exist_ok=True) 25 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 26 | output_file = output_dir / file.name 27 | convert(file, output_file) 28 | -------------------------------------------------------------------------------- /datasets/prepare_coco_semantic_annos_from_panoptic_annos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | 5 | import functools 6 | import json 7 | import multiprocessing as mp 8 | import numpy as np 9 | import os 10 | import time 11 | from fvcore.common.download import download 12 | from panopticapi.utils import rgb2id 13 | from PIL import Image 14 | 15 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES 16 | 17 | 18 | def _process_panoptic_to_semantic(input_panoptic, output_semantic, segments, id_map): 19 | panoptic = np.asarray(Image.open(input_panoptic), dtype=np.uint32) 20 | panoptic = rgb2id(panoptic) 21 | output = np.zeros_like(panoptic, dtype=np.uint8) + 255 22 | for seg in segments: 23 | cat_id = seg["category_id"] 24 | new_cat_id = id_map[cat_id] 25 | output[panoptic == seg["id"]] = new_cat_id 26 | Image.fromarray(output).save(output_semantic) 27 | 28 | 29 | def separate_coco_semantic_from_panoptic(panoptic_json, panoptic_root, sem_seg_root, categories): 30 | """ 31 | Create semantic segmentation annotations from panoptic segmentation 32 | annotations, to be used by PanopticFPN. 33 | It maps all thing categories to class 0, and maps all unlabeled pixels to class 255. 34 | It maps all stuff categories to contiguous ids starting from 1. 35 | Args: 36 | panoptic_json (str): path to the panoptic json file, in COCO's format. 37 | panoptic_root (str): a directory with panoptic annotation files, in COCO's format. 38 | sem_seg_root (str): a directory to output semantic annotation files 39 | categories (list[dict]): category metadata. Each dict needs to have: 40 | "id": corresponds to the "category_id" in the json annotations 41 | "isthing": 0 or 1 42 | """ 43 | os.makedirs(sem_seg_root, exist_ok=True) 44 | 45 | id_map = {} # map from category id to id in the output semantic annotation 46 | assert len(categories) <= 254 47 | for i, k in enumerate(categories): 48 | id_map[k["id"]] = i 49 | # what is id = 0? 50 | # id_map[0] = 255 51 | print(id_map) 52 | 53 | with open(panoptic_json) as f: 54 | obj = json.load(f) 55 | 56 | pool = mp.Pool(processes=max(mp.cpu_count() // 2, 4)) 57 | 58 | def iter_annotations(): 59 | for anno in obj["annotations"]: 60 | file_name = anno["file_name"] 61 | segments = anno["segments_info"] 62 | input = os.path.join(panoptic_root, file_name) 63 | output = os.path.join(sem_seg_root, file_name) 64 | yield input, output, segments 65 | 66 | print("Start writing to {} ...".format(sem_seg_root)) 67 | start = time.time() 68 | pool.starmap( 69 | functools.partial(_process_panoptic_to_semantic, id_map=id_map), 70 | iter_annotations(), 71 | chunksize=100, 72 | ) 73 | print("Finished. 
time: {:.2f}s".format(time.time() - start)) 74 | 75 | 76 | if __name__ == "__main__": 77 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco") 78 | for s in ["val2017", "train2017"]: 79 | separate_coco_semantic_from_panoptic( 80 | os.path.join(dataset_dir, "annotations/panoptic_{}.json".format(s)), 81 | os.path.join(dataset_dir, "panoptic_{}".format(s)), 82 | os.path.join(dataset_dir, "panoptic_semseg_{}".format(s)), 83 | COCO_CATEGORIES, 84 | ) 85 | -------------------------------------------------------------------------------- /demo/README.md: -------------------------------------------------------------------------------- 1 | # OneFormer Demo 2 | 3 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/SHI-Labs/OneFormer/blob/main/colab/oneformer_colab.ipynb) [![Huggingface space](https://img.shields.io/badge/🤗-Huggingface%20Space-cyan.svg)](https://huggingface.co/spaces/shi-labs/OneFormer) 4 | 5 | - Pick a model and its config file from. For example, `configs/ade20k/swin/oneformer_swin_large_IN21k_384_bs16_160k.yaml`. 6 | - We provide `demo.py` that is able to demo builtin configs. 7 | - You need to specify the `task` token value during inference, The outputs will be saved accordingly in the specified `OUTPUT_DIR`: 8 | - `panoptic`: Panoptic, Semantic and Instance Predictions when the value of `task` token is `panoptic`. 9 | - `instance`: Instance Predictions when the value of `task` token is `instance`. 10 | - `semantic`: Semantic Predictions when the value of `task` token is `semantic`. 11 | - >Note: You can change the outputs to be saved on line 60 in [predictor.py](predictor.py). 12 | 13 | ```bash 14 | export task=panoptic 15 | 16 | python demo.py --config-file ../configs/ade20k/swin/oneformer_swin_large_bs16_160k.yaml \ 17 | --input \ 18 | --output \ 19 | --task $task \ 20 | --opts MODEL.IS_TRAIN False MODEL.IS_DEMO True MODEL.WEIGHTS 21 | ``` 22 | 23 | For details of the command line arguments, see `demo.py -h` or look at its source code 24 | to understand its behavior. -------------------------------------------------------------------------------- /demo/colormap.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/utils/colormap.py 3 | # Modified by Jitesh Jain (https://github.com/praeclarumjj3) 4 | # ------------------------------------------------------------------------------ 5 | 6 | """ 7 | An awesome colormap for really neat visualizations. 8 | Copied from Detectron, and removed gray colors. 9 | """ 10 | 11 | import numpy as np 12 | import random 13 | random.seed(0) 14 | 15 | __all__ = ["colormap", "random_color", "random_colors"] 16 | 17 | _COLORS = [] 18 | 19 | def gen_color(): 20 | color = tuple(np.round(np.random.choice(range(256), size=3)/255, 3)) 21 | if color not in _COLORS and np.mean(color) != 0.0: 22 | _COLORS.append(color) 23 | else: 24 | gen_color() 25 | 26 | 27 | for _ in range(300): 28 | gen_color() 29 | 30 | 31 | def colormap(rgb=False, maximum=255): 32 | """ 33 | Args: 34 | rgb (bool): whether to return RGB colors or BGR colors. 
35 | maximum (int): either 255 or 1 36 | Returns: 37 | ndarray: a float32 array of Nx3 colors, in range [0, 255] or [0, 1] 38 | """ 39 | assert maximum in [255, 1], maximum 40 | c = _COLORS * maximum 41 | if not rgb: 42 | c = c[:, ::-1] 43 | return c 44 | 45 | 46 | def random_color(rgb=False, maximum=255): 47 | """ 48 | Args: 49 | rgb (bool): whether to return RGB colors or BGR colors. 50 | maximum (int): either 255 or 1 51 | Returns: 52 | ndarray: a vector of 3 numbers 53 | """ 54 | idx = np.random.randint(0, len(_COLORS)) 55 | ret = _COLORS[idx] * maximum 56 | if not rgb: 57 | ret = ret[::-1] 58 | return ret 59 | 60 | 61 | def random_colors(N, rgb=False, maximum=255): 62 | """ 63 | Args: 64 | N (int): number of unique colors needed 65 | rgb (bool): whether to return RGB colors or BGR colors. 66 | maximum (int): either 255 or 1 67 | Returns: 68 | ndarray: a list of random_color 69 | """ 70 | indices = random.sample(range(len(_COLORS)), N) 71 | ret = [_COLORS[i] * maximum for i in indices] 72 | if not rgb: 73 | ret = [x[::-1] for x in ret] 74 | return ret 75 | 76 | 77 | if __name__ == "__main__": 78 | import cv2 79 | 80 | size = 100 81 | H, W = 10, 10 82 | canvas = np.random.rand(H * size, W * size, 3).astype("float32") 83 | for h in range(H): 84 | for w in range(W): 85 | idx = h * W + w 86 | if idx >= len(_COLORS): 87 | break 88 | canvas[h * size : (h + 1) * size, w * size : (w + 1) * size] = _COLORS[idx] 89 | cv2.imshow("a", canvas) 90 | cv2.waitKey(0) -------------------------------------------------------------------------------- /demo/defaults.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/defaults.py 3 | # Modified by Jitesh Jain (https://github.com/praeclarumjj3) 4 | # ------------------------------------------------------------------------------ 5 | 6 | import torch 7 | import detectron2.data.transforms as T 8 | from detectron2.checkpoint import DetectionCheckpointer 9 | from detectron2.data import ( 10 | MetadataCatalog, 11 | ) 12 | from detectron2.modeling import build_model 13 | 14 | 15 | __all__ = [ 16 | "DefaultPredictor", 17 | ] 18 | 19 | 20 | class DefaultPredictor: 21 | """ 22 | Create a simple end-to-end predictor with the given config that runs on 23 | single device for a single input image. 24 | Compared to using the model directly, this class does the following additions: 25 | 1. Load checkpoint from `cfg.MODEL.WEIGHTS`. 26 | 2. Always take BGR image as the input and apply conversion defined by `cfg.INPUT.FORMAT`. 27 | 3. Apply resizing defined by `cfg.INPUT.{MIN,MAX}_SIZE_TEST`. 28 | 4. Take one input image and produce a single output, instead of a batch. 29 | This is meant for simple demo purposes, so it does the above steps automatically. 30 | This is not meant for benchmarks or running complicated inference logic. 31 | If you'd like to do anything more complicated, please refer to its source code as 32 | examples to build and use the model manually. 33 | Attributes: 34 | metadata (Metadata): the metadata of the underlying dataset, obtained from 35 | cfg.DATASETS.TEST. 
36 | Examples: 37 | :: 38 | pred = DefaultPredictor(cfg) 39 | inputs = cv2.imread("input.jpg") 40 | outputs = pred(inputs) 41 | """ 42 | 43 | def __init__(self, cfg): 44 | self.cfg = cfg.clone() # cfg can be modified by model 45 | self.model = build_model(self.cfg) 46 | self.model.eval() 47 | if len(cfg.DATASETS.TEST): 48 | self.metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0]) 49 | 50 | checkpointer = DetectionCheckpointer(self.model) 51 | checkpointer.load(cfg.MODEL.WEIGHTS) 52 | 53 | self.aug = T.ResizeShortestEdge( 54 | [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST 55 | ) 56 | 57 | self.input_format = cfg.INPUT.FORMAT 58 | assert self.input_format in ["RGB", "BGR"], self.input_format 59 | 60 | def __call__(self, original_image, task): 61 | """ 62 | Args: 63 | original_image (np.ndarray): an image of shape (H, W, C) (in BGR order). 64 | Returns: 65 | predictions (dict): 66 | the output of the model for one image only. 67 | See :doc:`/tutorials/models` for details about the format. 68 | """ 69 | with torch.no_grad(): # https://github.com/sphinx-doc/sphinx/issues/4258 70 | # Apply pre-processing to image. 71 | if self.input_format == "RGB": 72 | # whether the model expects BGR inputs or RGB 73 | original_image = original_image[:, :, ::-1] 74 | height, width = original_image.shape[:2] 75 | image = self.aug.get_transform(original_image).apply_image(original_image) 76 | image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1)) 77 | 78 | task = f"The task is {task}" 79 | 80 | inputs = {"image": image, "height": height, "width": width, "task": task} 81 | predictions = self.model([inputs])[0] 82 | return predictions -------------------------------------------------------------------------------- /demo/demo.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/demo/demo.py 3 | # Modified by Jitesh Jain (https://github.com/praeclarumjj3) 4 | # ------------------------------------------------------------------------------ 5 | 6 | import argparse 7 | import multiprocessing as mp 8 | import os 9 | import torch 10 | import random 11 | # fmt: off 12 | import sys 13 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 14 | # fmt: on 15 | 16 | import time 17 | import cv2 18 | import numpy as np 19 | import tqdm 20 | 21 | from detectron2.config import get_cfg 22 | from detectron2.data.detection_utils import read_image 23 | from detectron2.projects.deeplab import add_deeplab_config 24 | from detectron2.utils.logger import setup_logger 25 | 26 | from oneformer import ( 27 | add_oneformer_config, 28 | add_common_config, 29 | add_swin_config, 30 | add_dinat_config, 31 | add_convnext_config, 32 | ) 33 | from predictor import VisualizationDemo 34 | 35 | # constants 36 | WINDOW_NAME = "OneFormer Demo" 37 | 38 | def setup_cfg(args): 39 | # load config from file and command-line arguments 40 | cfg = get_cfg() 41 | add_deeplab_config(cfg) 42 | add_common_config(cfg) 43 | add_swin_config(cfg) 44 | add_dinat_config(cfg) 45 | add_convnext_config(cfg) 46 | add_oneformer_config(cfg) 47 | cfg.merge_from_file(args.config_file) 48 | cfg.merge_from_list(args.opts) 49 | cfg.freeze() 50 | return cfg 51 | 52 | 53 | def get_parser(): 54 | parser = argparse.ArgumentParser(description="oneformer demo for builtin configs") 55 | parser.add_argument( 56 | "--config-file", 57 | 
default="../configs/ade20k/swin/oneformer_swin_large_IN21k_384_bs16_160k.yaml", 58 | metavar="FILE", 59 | help="path to config file", 60 | ) 61 | parser.add_argument("--task", help="Task type") 62 | parser.add_argument( 63 | "--input", 64 | nargs="+", 65 | help="A list of space separated input images; " 66 | "or a single glob pattern such as 'directory/*.jpg'", 67 | ) 68 | parser.add_argument( 69 | "--output", 70 | help="A file or directory to save output visualizations. " 71 | "If not given, will show output in an OpenCV window.", 72 | ) 73 | 74 | parser.add_argument( 75 | "--confidence-threshold", 76 | type=float, 77 | default=0.5, 78 | help="Minimum score for instance predictions to be shown", 79 | ) 80 | parser.add_argument( 81 | "--opts", 82 | help="Modify config options using the command-line 'KEY VALUE' pairs", 83 | default=[], 84 | nargs=argparse.REMAINDER, 85 | ) 86 | return parser 87 | 88 | if __name__ == "__main__": 89 | seed = 0 90 | random.seed(seed) 91 | np.random.seed(seed) 92 | torch.manual_seed(seed) 93 | torch.cuda.manual_seed_all(seed) 94 | torch.backends.cudnn.deterministic = True 95 | torch.backends.cudnn.benchmark = False 96 | 97 | mp.set_start_method("spawn", force=True) 98 | args = get_parser().parse_args() 99 | setup_logger(name="fvcore") 100 | logger = setup_logger() 101 | logger.info("Arguments: " + str(args)) 102 | 103 | cfg = setup_cfg(args) 104 | 105 | demo = VisualizationDemo(cfg) 106 | 107 | if args.input: 108 | for path in tqdm.tqdm(args.input, disable=not args.output): 109 | # use PIL, to be consistent with evaluation 110 | 111 | img = read_image(path, format="BGR") 112 | start_time = time.time() 113 | predictions, visualized_output = demo.run_on_image(img, args.task) 114 | logger.info( 115 | "{}: {} in {:.2f}s".format( 116 | path, 117 | "detected {} instances".format(len(predictions["instances"])) 118 | if "instances" in predictions 119 | else "finished", 120 | time.time() - start_time, 121 | ) 122 | ) 123 | if args.output: 124 | if len(args.input) == 1: 125 | for k in visualized_output.keys(): 126 | os.makedirs(k, exist_ok=True) 127 | out_filename = os.path.join(k, args.output) 128 | visualized_output[k].save(out_filename) 129 | else: 130 | for k in visualized_output.keys(): 131 | opath = os.path.join(args.output, k) 132 | os.makedirs(opath, exist_ok=True) 133 | out_filename = os.path.join(opath, os.path.basename(path)) 134 | visualized_output[k].save(out_filename) 135 | else: 136 | raise ValueError("Please specify an output path!") 137 | else: 138 | raise ValueError("No Input Given") 139 | -------------------------------------------------------------------------------- /demo/predictor.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Reference: https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py 3 | # Modified by Jitesh Jain (https://github.com/praeclarumjj3) 4 | # ------------------------------------------------------------------------------ 5 | 6 | import atexit 7 | import bisect 8 | import multiprocessing as mp 9 | import torch 10 | 11 | from detectron2.data import MetadataCatalog 12 | from defaults import DefaultPredictor 13 | from visualizer import ColorMode, Visualizer 14 | 15 | 16 | class VisualizationDemo(object): 17 | def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False): 18 | """ 19 | Args: 20 | cfg (CfgNode): 21 | instance_mode (ColorMode): 22 | parallel (bool): whether to run 
the model in different processes from visualization. 23 | Useful since the visualization logic can be slow. 24 | """ 25 | self.metadata = MetadataCatalog.get( 26 | cfg.DATASETS.TEST_PANOPTIC[0] if len(cfg.DATASETS.TEST_PANOPTIC) else "__unused" 27 | ) 28 | if 'cityscapes_fine_sem_seg_val' in cfg.DATASETS.TEST_PANOPTIC[0]: 29 | from cityscapesscripts.helpers.labels import labels 30 | stuff_colors = [k.color for k in labels if k.trainId != 255] 31 | self.metadata = self.metadata.set(stuff_colors=stuff_colors) 32 | self.cpu_device = torch.device("cpu") 33 | self.instance_mode = instance_mode 34 | 35 | self.parallel = parallel 36 | if parallel: 37 | num_gpu = torch.cuda.device_count() 38 | self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu) 39 | else: 40 | self.predictor = DefaultPredictor(cfg) 41 | 42 | def run_on_image(self, image, task): 43 | """ 44 | Args: 45 | image (np.ndarray): an image of shape (H, W, C) (in BGR order). 46 | This is the format used by OpenCV. 47 | Returns: 48 | predictions (dict): the output of the model. 49 | vis_output (VisImage): the visualized image output. 50 | """ 51 | vis_output = None 52 | # Convert image from OpenCV BGR format to Matplotlib RGB format. 53 | image = image[:, :, ::-1] 54 | vis_output = {} 55 | 56 | if task == 'panoptic': 57 | visualizer = Visualizer(image, metadata=self.metadata, instance_mode=ColorMode.IMAGE) 58 | predictions = self.predictor(image, task) 59 | panoptic_seg, segments_info = predictions["panoptic_seg"] 60 | vis_output['panoptic_inference'] = visualizer.draw_panoptic_seg_predictions( 61 | panoptic_seg.to(self.cpu_device), segments_info, alpha=0.7 62 | ) 63 | 64 | if task == 'panoptic' or task == 'semantic': 65 | visualizer = Visualizer(image, metadata=self.metadata, instance_mode=ColorMode.IMAGE_BW) 66 | predictions = self.predictor(image, task) 67 | vis_output['semantic_inference'] = visualizer.draw_sem_seg( 68 | predictions["sem_seg"].argmax(dim=0).to(self.cpu_device), alpha=0.7 69 | ) 70 | 71 | if task == 'panoptic' or task == 'instance': 72 | visualizer = Visualizer(image, metadata=self.metadata, instance_mode=ColorMode.IMAGE_BW) 73 | predictions = self.predictor(image, task) 74 | instances = predictions["instances"].to(self.cpu_device) 75 | vis_output['instance_inference'] = visualizer.draw_instance_predictions(predictions=instances, alpha=1) 76 | 77 | return predictions, vis_output 78 | 79 | 80 | class AsyncPredictor: 81 | """ 82 | A predictor that runs the model asynchronously, possibly on >1 GPUs. 83 | Because rendering the visualization takes considerably amount of time, 84 | this helps improve throughput a little bit when rendering videos. 
85 | """ 86 | 87 | class _StopToken: 88 | pass 89 | 90 | class _PredictWorker(mp.Process): 91 | def __init__(self, cfg, task_queue, result_queue): 92 | self.cfg = cfg 93 | self.task_queue = task_queue 94 | self.result_queue = result_queue 95 | super().__init__() 96 | 97 | def run(self): 98 | predictor = DefaultPredictor(self.cfg) 99 | 100 | while True: 101 | task = self.task_queue.get() 102 | if isinstance(task, AsyncPredictor._StopToken): 103 | break 104 | idx, data = task 105 | result = predictor(data) 106 | self.result_queue.put((idx, result)) 107 | 108 | def __init__(self, cfg, num_gpus: int = 1): 109 | """ 110 | Args: 111 | cfg (CfgNode): 112 | num_gpus (int): if 0, will run on CPU 113 | """ 114 | num_workers = max(num_gpus, 1) 115 | self.task_queue = mp.Queue(maxsize=num_workers * 3) 116 | self.result_queue = mp.Queue(maxsize=num_workers * 3) 117 | self.procs = [] 118 | for gpuid in range(max(num_gpus, 1)): 119 | cfg = cfg.clone() 120 | cfg.defrost() 121 | cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu" 122 | self.procs.append( 123 | AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue) 124 | ) 125 | 126 | self.put_idx = 0 127 | self.get_idx = 0 128 | self.result_rank = [] 129 | self.result_data = [] 130 | 131 | for p in self.procs: 132 | p.start() 133 | atexit.register(self.shutdown) 134 | 135 | def put(self, image): 136 | self.put_idx += 1 137 | self.task_queue.put((self.put_idx, image)) 138 | 139 | def get(self): 140 | self.get_idx += 1 # the index needed for this request 141 | if len(self.result_rank) and self.result_rank[0] == self.get_idx: 142 | res = self.result_data[0] 143 | del self.result_data[0], self.result_rank[0] 144 | return res 145 | 146 | while True: 147 | # make sure the results are returned in the correct order 148 | idx, res = self.result_queue.get() 149 | if idx == self.get_idx: 150 | return res 151 | insert = bisect.bisect(self.result_rank, idx) 152 | self.result_rank.insert(insert, idx) 153 | self.result_data.insert(insert, res) 154 | 155 | def __len__(self): 156 | return self.put_idx - self.get_idx 157 | 158 | def __call__(self, image): 159 | self.put(image) 160 | return self.get() 161 | 162 | def shutdown(self): 163 | for _ in self.procs: 164 | self.task_queue.put(AsyncPredictor._StopToken()) 165 | 166 | @property 167 | def default_buffer_size(self): 168 | return len(self.procs) * 5 169 | -------------------------------------------------------------------------------- /images/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SHI-Labs/OneFormer/4962ef6a96ffb76a76771bfa3e8b3587f209752b/images/teaser.png -------------------------------------------------------------------------------- /oneformer/__init__.py: -------------------------------------------------------------------------------- 1 | from . import data # register all new datasets 2 | from . 
import modeling 3 | 4 | # config 5 | from .config import * 6 | 7 | # dataset loading 8 | from .data.dataset_mappers.coco_unified_new_baseline_dataset_mapper import COCOUnifiedNewBaselineDatasetMapper 9 | from .data.dataset_mappers.oneformer_unified_dataset_mapper import ( 10 | OneFormerUnifiedDatasetMapper, 11 | ) 12 | 13 | # models 14 | from .oneformer_model import OneFormer 15 | from .test_time_augmentation import SemanticSegmentorWithTTA 16 | 17 | # evaluation 18 | from .evaluation.instance_evaluation import InstanceSegEvaluator 19 | -------------------------------------------------------------------------------- /oneformer/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | from detectron2.config import CfgNode as CN 4 | 5 | __all__ = ["add_common_config", "add_oneformer_config", "add_swin_config", 6 | "add_dinat_config", "add_convnext_config"] 7 | 8 | def add_common_config(cfg): 9 | """ 10 | Add config for common configuration 11 | """ 12 | 13 | # data config 14 | # select the dataset mapper 15 | cfg.INPUT.DATASET_MAPPER_NAME = "oneformer_unified" 16 | # Color augmentation 17 | cfg.INPUT.COLOR_AUG_SSD = False 18 | # We retry random cropping until no single category in semantic segmentation GT occupies more 19 | # than `SINGLE_CATEGORY_MAX_AREA` part of the crop. 20 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0 21 | # Pad image and segmentation GT in dataset mapper. 22 | cfg.INPUT.SIZE_DIVISIBILITY = -1 23 | 24 | cfg.INPUT.TASK_SEQ_LEN = 77 25 | cfg.INPUT.MAX_SEQ_LEN = 77 26 | 27 | cfg.INPUT.TASK_PROB = CN() 28 | cfg.INPUT.TASK_PROB.SEMANTIC = 0.33 29 | cfg.INPUT.TASK_PROB.INSTANCE = 0.66 30 | 31 | # test dataset 32 | cfg.DATASETS.TEST_PANOPTIC = ("",) 33 | cfg.DATASETS.TEST_INSTANCE = ("",) 34 | cfg.DATASETS.TEST_SEMANTIC = ("",) 35 | 36 | # solver config 37 | # weight decay on embedding 38 | cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0 39 | # optimizer 40 | cfg.SOLVER.OPTIMIZER = "ADAMW" 41 | cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1 42 | 43 | # wandb 44 | cfg.WANDB = CN() 45 | cfg.WANDB.PROJECT = "OneFormer" 46 | cfg.WANDB.NAME = None 47 | 48 | cfg.MODEL.IS_TRAIN = True 49 | cfg.MODEL.IS_DEMO = False 50 | 51 | # text encoder config 52 | cfg.MODEL.TEXT_ENCODER = CN() 53 | 54 | cfg.MODEL.TEXT_ENCODER.WIDTH = 256 55 | cfg.MODEL.TEXT_ENCODER.CONTEXT_LENGTH = 77 56 | cfg.MODEL.TEXT_ENCODER.NUM_LAYERS = 12 57 | cfg.MODEL.TEXT_ENCODER.VOCAB_SIZE = 49408 58 | cfg.MODEL.TEXT_ENCODER.PROJ_NUM_LAYERS = 2 59 | cfg.MODEL.TEXT_ENCODER.N_CTX = 16 60 | 61 | # oneformer inference config 62 | cfg.MODEL.TEST = CN() 63 | cfg.MODEL.TEST.SEMANTIC_ON = True 64 | cfg.MODEL.TEST.INSTANCE_ON = False 65 | cfg.MODEL.TEST.PANOPTIC_ON = False 66 | cfg.MODEL.TEST.DETECTION_ON = False 67 | cfg.MODEL.TEST.OBJECT_MASK_THRESHOLD = 0.0 68 | cfg.MODEL.TEST.OVERLAP_THRESHOLD = 0.0 69 | cfg.MODEL.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False 70 | cfg.MODEL.TEST.TASK = "panoptic" 71 | 72 | # TEST AUG Slide 73 | cfg.TEST.AUG.IS_SLIDE = False 74 | cfg.TEST.AUG.CROP_SIZE = (640, 640) 75 | cfg.TEST.AUG.STRIDE = (426, 426) 76 | cfg.TEST.AUG.SCALE = (2048, 640) 77 | cfg.TEST.AUG.SETR_MULTI_SCALE = True 78 | cfg.TEST.AUG.KEEP_RATIO = True 79 | cfg.TEST.AUG.SIZE_DIVISOR = 32 80 | 81 | # pixel decoder config 82 | cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256 83 | # adding transformer in pixel decoder 84 | cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0 85 | # pixel decoder 86 | 
cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder" 87 | cfg.MODEL.SEM_SEG_HEAD.SEM_EMBED_DIM = 256 88 | cfg.MODEL.SEM_SEG_HEAD.INST_EMBED_DIM = 256 89 | 90 | # LSJ aug 91 | cfg.INPUT.IMAGE_SIZE = 1024 92 | cfg.INPUT.MIN_SCALE = 0.1 93 | cfg.INPUT.MAX_SCALE = 2.0 94 | 95 | # MSDeformAttn encoder configs 96 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"] 97 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4 98 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8 99 | 100 | def add_oneformer_config(cfg): 101 | """ 102 | Add config for ONE_FORMER. 103 | """ 104 | 105 | # oneformer model config 106 | cfg.MODEL.ONE_FORMER = CN() 107 | 108 | # loss 109 | cfg.MODEL.ONE_FORMER.DEEP_SUPERVISION = True 110 | cfg.MODEL.ONE_FORMER.NO_OBJECT_WEIGHT = 0.1 111 | cfg.MODEL.ONE_FORMER.CLASS_WEIGHT = 1.0 112 | cfg.MODEL.ONE_FORMER.DICE_WEIGHT = 1.0 113 | cfg.MODEL.ONE_FORMER.MASK_WEIGHT = 20.0 114 | cfg.MODEL.ONE_FORMER.CONTRASTIVE_WEIGHT = 0.5 115 | cfg.MODEL.ONE_FORMER.CONTRASTIVE_TEMPERATURE = 0.07 116 | 117 | # transformer config 118 | cfg.MODEL.ONE_FORMER.NHEADS = 8 119 | cfg.MODEL.ONE_FORMER.DROPOUT = 0.1 120 | cfg.MODEL.ONE_FORMER.DIM_FEEDFORWARD = 2048 121 | cfg.MODEL.ONE_FORMER.ENC_LAYERS = 0 122 | cfg.MODEL.ONE_FORMER.CLASS_DEC_LAYERS = 2 123 | cfg.MODEL.ONE_FORMER.DEC_LAYERS = 6 124 | cfg.MODEL.ONE_FORMER.PRE_NORM = False 125 | 126 | cfg.MODEL.ONE_FORMER.HIDDEN_DIM = 256 127 | cfg.MODEL.ONE_FORMER.NUM_OBJECT_QUERIES = 120 128 | cfg.MODEL.ONE_FORMER.NUM_OBJECT_CTX = 16 129 | cfg.MODEL.ONE_FORMER.USE_TASK_NORM = True 130 | 131 | cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE = "res5" 132 | cfg.MODEL.ONE_FORMER.ENFORCE_INPUT_PROJ = False 133 | 134 | # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet) 135 | # you can use this config to override 136 | cfg.MODEL.ONE_FORMER.SIZE_DIVISIBILITY = 32 137 | 138 | # transformer module 139 | cfg.MODEL.ONE_FORMER.TRANSFORMER_DECODER_NAME = "ContrastiveMultiScaleMaskedTransformerDecoder" 140 | 141 | # point loss configs 142 | # Number of points sampled during training for a mask point head. 143 | cfg.MODEL.ONE_FORMER.TRAIN_NUM_POINTS = 112 * 112 144 | # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the 145 | # original paper. 146 | cfg.MODEL.ONE_FORMER.OVERSAMPLE_RATIO = 3.0 147 | # Importance sampling parameter for PointRend point sampling during training. Parametr `beta` in 148 | # the original paper. 149 | cfg.MODEL.ONE_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75 150 | 151 | def add_swin_config(cfg): 152 | """ 153 | Add config forSWIN Backbone. 154 | """ 155 | 156 | # swin transformer backbone 157 | cfg.MODEL.SWIN = CN() 158 | cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224 159 | cfg.MODEL.SWIN.PATCH_SIZE = 4 160 | cfg.MODEL.SWIN.EMBED_DIM = 96 161 | cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2] 162 | cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24] 163 | cfg.MODEL.SWIN.WINDOW_SIZE = 7 164 | cfg.MODEL.SWIN.MLP_RATIO = 4.0 165 | cfg.MODEL.SWIN.QKV_BIAS = True 166 | cfg.MODEL.SWIN.QK_SCALE = None 167 | cfg.MODEL.SWIN.DROP_RATE = 0.0 168 | cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0 169 | cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3 170 | cfg.MODEL.SWIN.APE = False 171 | cfg.MODEL.SWIN.PATCH_NORM = True 172 | cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"] 173 | cfg.MODEL.SWIN.USE_CHECKPOINT = False 174 | 175 | def add_dinat_config(cfg): 176 | """ 177 | Add config for NAT Backbone. 
178 | """ 179 | 180 | # DINAT transformer backbone 181 | cfg.MODEL.DiNAT = CN() 182 | cfg.MODEL.DiNAT.DEPTHS = [3, 4, 18, 5] 183 | cfg.MODEL.DiNAT.OUT_FEATURES = ["res2", "res3", "res4", "res5"] 184 | cfg.MODEL.DiNAT.EMBED_DIM = 64 185 | cfg.MODEL.DiNAT.MLP_RATIO = 3.0 186 | cfg.MODEL.DiNAT.NUM_HEADS = [2, 4, 8, 16] 187 | cfg.MODEL.DiNAT.DROP_PATH_RATE = 0.2 188 | cfg.MODEL.DiNAT.KERNEL_SIZE = 7 189 | cfg.MODEL.DiNAT.DILATIONS = [[1, 16, 1], [1, 4, 1, 8], [1, 2, 1, 3, 1, 4], [1, 2, 1, 2, 1]] 190 | cfg.MODEL.DiNAT.OUT_INDICES = (0, 1, 2, 3) 191 | cfg.MODEL.DiNAT.QKV_BIAS = True 192 | cfg.MODEL.DiNAT.QK_SCALE = None 193 | cfg.MODEL.DiNAT.DROP_RATE = 0 194 | cfg.MODEL.DiNAT.ATTN_DROP_RATE = 0. 195 | cfg.MODEL.DiNAT.IN_PATCH_SIZE = 4 196 | 197 | def add_convnext_config(cfg): 198 | """ 199 | Add config for ConvNeXt Backbone. 200 | """ 201 | 202 | # swin transformer backbone 203 | cfg.MODEL.CONVNEXT = CN() 204 | cfg.MODEL.CONVNEXT.IN_CHANNELS = 3 205 | cfg.MODEL.CONVNEXT.DEPTHS = [3, 3, 27, 3] 206 | cfg.MODEL.CONVNEXT.DIMS = [192, 384, 768, 1536] 207 | cfg.MODEL.CONVNEXT.DROP_PATH_RATE = 0.4 208 | cfg.MODEL.CONVNEXT.LSIT = 1.0 209 | cfg.MODEL.CONVNEXT.OUT_INDICES = [0, 1, 2, 3] 210 | cfg.MODEL.CONVNEXT.OUT_FEATURES = ["res2", "res3", "res4", "res5"] -------------------------------------------------------------------------------- /oneformer/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import datasets 2 | -------------------------------------------------------------------------------- /oneformer/data/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SHI-Labs/OneFormer/4962ef6a96ffb76a76771bfa3e8b3587f209752b/oneformer/data/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /oneformer/data/build.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/build.py 3 | # Modified by Jitesh Jain (https://github.com/praeclarumjj3) 4 | # ------------------------------------------------------------------------------ 5 | 6 | from typing import Any, Callable, Dict, List, Optional, Union 7 | import torch.utils.data as torchdata 8 | 9 | from detectron2.config import configurable 10 | 11 | 12 | from detectron2.data.common import DatasetFromList, MapDataset 13 | from detectron2.data.dataset_mapper import DatasetMapper 14 | from detectron2.data.samplers import ( 15 | InferenceSampler, 16 | ) 17 | from detectron2.data.build import ( 18 | get_detection_dataset_dicts, 19 | trivial_batch_collator 20 | ) 21 | """ 22 | This file contains the default logic to build a dataloader for training or testing. 23 | """ 24 | 25 | __all__ = [ 26 | "build_detection_test_loader", 27 | ] 28 | 29 | 30 | def _test_loader_from_config(cfg, dataset_name, mapper=None): 31 | """ 32 | Uses the given `dataset_name` argument (instead of the names in cfg), because the 33 | standard practice is to evaluate each test set individually (not combining them). 
34 | """ 35 | if isinstance(dataset_name, str): 36 | dataset_name = [dataset_name] 37 | 38 | dataset = get_detection_dataset_dicts( 39 | dataset_name, 40 | filter_empty=False, 41 | proposal_files=[ 42 | cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(x)] for x in dataset_name 43 | ] 44 | if cfg.MODEL.LOAD_PROPOSALS 45 | else None, 46 | ) 47 | if mapper is None: 48 | mapper = DatasetMapper(cfg, False) 49 | return { 50 | "dataset": dataset, 51 | "mapper": mapper, 52 | "num_workers": cfg.DATALOADER.NUM_WORKERS, 53 | "sampler": InferenceSampler(len(dataset)) 54 | if not isinstance(dataset, torchdata.IterableDataset) 55 | else None, 56 | } 57 | 58 | 59 | @configurable(from_config=_test_loader_from_config) 60 | def build_detection_test_loader( 61 | dataset: Union[List[Any], torchdata.Dataset], 62 | *, 63 | mapper: Callable[[Dict[str, Any]], Any], 64 | sampler: Optional[torchdata.Sampler] = None, 65 | batch_size: int = 1, 66 | num_workers: int = 0, 67 | collate_fn: Optional[Callable[[List[Any]], Any]] = None, 68 | ) -> torchdata.DataLoader: 69 | """ 70 | Similar to `build_detection_train_loader`, with default batch size = 1, 71 | and sampler = :class:`InferenceSampler`. This sampler coordinates all workers 72 | to produce the exact set of all samples. 73 | 74 | Args: 75 | dataset: a list of dataset dicts, 76 | or a pytorch dataset (either map-style or iterable). They can be obtained 77 | by using :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`. 78 | mapper: a callable which takes a sample (dict) from dataset 79 | and returns the format to be consumed by the model. 80 | When using cfg, the default choice is ``DatasetMapper(cfg, is_train=False)``. 81 | sampler: a sampler that produces 82 | indices to be applied on ``dataset``. Default to :class:`InferenceSampler`, 83 | which splits the dataset across all workers. Sampler must be None 84 | if `dataset` is iterable. 85 | batch_size: the batch size of the data loader to be created. 86 | Default to 1 image per worker since this is the standard when reporting 87 | inference time in papers. 88 | num_workers: number of parallel data loading workers 89 | collate_fn: same as the argument of `torch.utils.data.DataLoader`. 90 | Defaults to do no collation and return a list of data. 91 | 92 | Returns: 93 | DataLoader: a torch DataLoader, that loads the given detection 94 | dataset, with test-time transformation and batching. 
95 | 96 | Examples: 97 | :: 98 | data_loader = build_detection_test_loader( 99 | DatasetRegistry.get("my_test"), 100 | mapper=DatasetMapper(...)) 101 | 102 | # or, instantiate with a CfgNode: 103 | data_loader = build_detection_test_loader(cfg, "my_test") 104 | """ 105 | if isinstance(dataset, list): 106 | dataset = DatasetFromList(dataset, copy=False) 107 | if mapper is not None: 108 | dataset = MapDataset(dataset, mapper) 109 | if isinstance(dataset, torchdata.IterableDataset): 110 | assert sampler is None, "sampler must be None if dataset is IterableDataset" 111 | else: 112 | if sampler is None: 113 | sampler = InferenceSampler(len(dataset)) 114 | return torchdata.DataLoader( 115 | dataset, 116 | batch_size=batch_size, 117 | sampler=sampler, 118 | drop_last=False, 119 | num_workers=num_workers, 120 | collate_fn=trivial_batch_collator if collate_fn is None else collate_fn, 121 | ) -------------------------------------------------------------------------------- /oneformer/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /oneformer/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from . import ( 2 | register_ade20k_panoptic, 3 | register_cityscapes_panoptic, 4 | register_coco_panoptic_annos_semseg, 5 | register_ade20k_instance, 6 | register_coco_panoptic2instance, 7 | register_mapillary_vistas, 8 | register_mapillary_vistas_panoptic, 9 | ) 10 | -------------------------------------------------------------------------------- /oneformer/data/datasets/register_ade20k_instance.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/datasets/register_ade20k_instance.py 3 | # ------------------------------------------------------------------------------ 4 | 5 | import json 6 | import logging 7 | import numpy as np 8 | import os 9 | from PIL import Image 10 | 11 | from detectron2.data import DatasetCatalog, MetadataCatalog 12 | from detectron2.data.datasets.coco import load_coco_json, register_coco_instances 13 | from detectron2.utils.file_io import PathManager 14 | 15 | ADE_CATEGORIES = [{'id': 7, 'name': 'bed'}, {'id': 8, 'name': 'windowpane'}, {'id': 10, 'name': 'cabinet'}, {'id': 12, 'name': 'person'}, {'id': 14, 'name': 'door'}, {'id': 15, 'name': 'table'}, {'id': 18, 'name': 'curtain'}, {'id': 19, 'name': 'chair'}, {'id': 20, 'name': 'car'}, {'id': 22, 'name': 'painting'}, {'id': 23, 'name': 'sofa'}, {'id': 24, 'name': 'shelf'}, {'id': 27, 'name': 'mirror'}, {'id': 30, 'name': 'armchair'}, {'id': 31, 'name': 'seat'}, {'id': 32, 'name': 'fence'}, {'id': 33, 'name': 'desk'}, {'id': 35, 'name': 'wardrobe'}, {'id': 36, 'name': 'lamp'}, {'id': 37, 'name': 'bathtub'}, {'id': 38, 'name': 'railing'}, {'id': 39, 'name': 'cushion'}, {'id': 41, 'name': 'box'}, {'id': 42, 'name': 'column'}, {'id': 43, 'name': 'signboard'}, {'id': 44, 'name': 'chest of drawers'}, {'id': 45, 'name': 'counter'}, {'id': 47, 'name': 'sink'}, {'id': 49, 'name': 'fireplace'}, {'id': 50, 'name': 'refrigerator'}, {'id': 53, 'name': 'stairs'}, {'id': 55, 'name': 'case'}, {'id': 56, 'name': 'pool table'}, {'id': 57, 'name': 'pillow'}, {'id': 58, 'name': 'screen door'}, {'id': 62, 'name': 'bookcase'}, {'id': 64, 
'name': 'coffee table'}, {'id': 65, 'name': 'toilet'}, {'id': 66, 'name': 'flower'}, {'id': 67, 'name': 'book'}, {'id': 69, 'name': 'bench'}, {'id': 70, 'name': 'countertop'}, {'id': 71, 'name': 'stove'}, {'id': 72, 'name': 'palm'}, {'id': 73, 'name': 'kitchen island'}, {'id': 74, 'name': 'computer'}, {'id': 75, 'name': 'swivel chair'}, {'id': 76, 'name': 'boat'}, {'id': 78, 'name': 'arcade machine'}, {'id': 80, 'name': 'bus'}, {'id': 81, 'name': 'towel'}, {'id': 82, 'name': 'light'}, {'id': 83, 'name': 'truck'}, {'id': 85, 'name': 'chandelier'}, {'id': 86, 'name': 'awning'}, {'id': 87, 'name': 'streetlight'}, {'id': 88, 'name': 'booth'}, {'id': 89, 'name': 'television receiver'}, {'id': 90, 'name': 'airplane'}, {'id': 92, 'name': 'apparel'}, {'id': 93, 'name': 'pole'}, {'id': 95, 'name': 'bannister'}, {'id': 97, 'name': 'ottoman'}, {'id': 98, 'name': 'bottle'}, {'id': 102, 'name': 'van'}, {'id': 103, 'name': 'ship'}, {'id': 104, 'name': 'fountain'}, {'id': 107, 'name': 'washer'}, {'id': 108, 'name': 'plaything'}, {'id': 110, 'name': 'stool'}, {'id': 111, 'name': 'barrel'}, {'id': 112, 'name': 'basket'}, {'id': 115, 'name': 'bag'}, {'id': 116, 'name': 'minibike'}, {'id': 118, 'name': 'oven'}, {'id': 119, 'name': 'ball'}, {'id': 120, 'name': 'food'}, {'id': 121, 'name': 'step'}, {'id': 123, 'name': 'trade name'}, {'id': 124, 'name': 'microwave'}, {'id': 125, 'name': 'pot'}, {'id': 126, 'name': 'animal'}, {'id': 127, 'name': 'bicycle'}, {'id': 129, 'name': 'dishwasher'}, {'id': 130, 'name': 'screen'}, {'id': 132, 'name': 'sculpture'}, {'id': 133, 'name': 'hood'}, {'id': 134, 'name': 'sconce'}, {'id': 135, 'name': 'vase'}, {'id': 136, 'name': 'traffic light'}, {'id': 137, 'name': 'tray'}, {'id': 138, 'name': 'ashcan'}, {'id': 139, 'name': 'fan'}, {'id': 142, 'name': 'plate'}, {'id': 143, 'name': 'monitor'}, {'id': 144, 'name': 'bulletin board'}, {'id': 146, 'name': 'radiator'}, {'id': 147, 'name': 'glass'}, {'id': 148, 'name': 'clock'}, {'id': 149, 'name': 'flag'}] 16 | 17 | 18 | _PREDEFINED_SPLITS = { 19 | # point annotations without masks 20 | "ade20k_instance_train": ( 21 | "ADEChallengeData2016/images/training", 22 | "ADEChallengeData2016/ade20k_instance_train.json", 23 | ), 24 | "ade20k_instance_val": ( 25 | "ADEChallengeData2016/images/validation", 26 | "ADEChallengeData2016/ade20k_instance_val.json", 27 | ), 28 | } 29 | 30 | 31 | def _get_ade_instances_meta(): 32 | thing_ids = [k["id"] for k in ADE_CATEGORIES] 33 | assert len(thing_ids) == 100, len(thing_ids) 34 | # Mapping from the incontiguous ADE category id to an id in [0, 99] 35 | thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} 36 | thing_classes = [k["name"] for k in ADE_CATEGORIES] 37 | ret = { 38 | "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, 39 | "thing_classes": thing_classes, 40 | } 41 | return ret 42 | 43 | 44 | def register_all_ade20k_instance(root): 45 | for key, (image_root, json_file) in _PREDEFINED_SPLITS.items(): 46 | # Assume pre-defined datasets live in `./datasets`. 
47 | register_coco_instances( 48 | key, 49 | _get_ade_instances_meta(), 50 | os.path.join(root, json_file) if "://" not in json_file else json_file, 51 | os.path.join(root, image_root), 52 | ) 53 | 54 | 55 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 56 | register_all_ade20k_instance(_root) 57 | -------------------------------------------------------------------------------- /oneformer/data/datasets/register_coco_panoptic2instance.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/datasets/builtin.py 3 | # Modified by Jitesh Jain (https://github.com/praeclarumjj3) 4 | # ------------------------------------------------------------------------------ 5 | 6 | 7 | """ 8 | This file registers pre-defined datasets at hard-coded paths, and their metadata. 9 | 10 | We hard-code metadata for common datasets. This will enable: 11 | 1. Consistency check when loading the datasets 12 | 2. Use models on these standard datasets directly and run demos, 13 | without having to download the dataset annotations 14 | 15 | We hard-code some paths to the dataset that's assumed to 16 | exist in "./datasets/". 17 | 18 | Users SHOULD NOT use this file to create new dataset / metadata for new dataset. 19 | To add new dataset, refer to the tutorial "docs/DATASETS.md". 20 | """ 21 | 22 | import os 23 | from detectron2.data.datasets.builtin_meta import _get_builtin_metadata 24 | from detectron2.data.datasets.coco import register_coco_instances 25 | 26 | 27 | _PREDEFINED_SPLITS_COCO = { 28 | "coco_2017_val_panoptic2instance": ("coco/val2017", "coco/annotations/panoptic2instances_val2017.json"), 29 | } 30 | 31 | 32 | def register_panoptic2instances_coco(root): 33 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_COCO.items(): 34 | # Assume pre-defined datasets live in `./datasets`. 35 | register_coco_instances( 36 | key, 37 | _get_builtin_metadata("coco"), 38 | os.path.join(root, json_file) if "://" not in json_file else json_file, 39 | os.path.join(root, image_root), 40 | ) 41 | 42 | 43 | _root = os.path.expanduser(os.getenv("DETECTRON2_DATASETS", "datasets")) 44 | register_panoptic2instances_coco(_root) -------------------------------------------------------------------------------- /oneformer/data/tokenizer.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # MIT License 3 | # 4 | # Copyright (c) 2021 OpenAI 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 19 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | # SOFTWARE. 23 | # 24 | # Modified by Jiarui Xu 25 | # ------------------------------------------------------------------------- 26 | 27 | import gzip 28 | import html 29 | import os 30 | from functools import lru_cache 31 | 32 | import ftfy 33 | import regex as re 34 | import torch 35 | 36 | 37 | @lru_cache() 38 | def default_bpe(): 39 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), 'bpe_simple_vocab_16e6.txt.gz') 40 | 41 | 42 | @lru_cache() 43 | def bytes_to_unicode(): 44 | """Returns list of utf-8 byte and a corresponding list of unicode strings. 45 | 46 | The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab 47 | if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for decent 48 | coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup tables 49 | between utf-8 bytes and unicode strings. And avoids mapping to whitespace/control characters the bpe code barfs on. 50 | """ 51 | bs = list(range(ord('!'), ord('~') + 1)) + list(range(ord('¡'), ord('¬') + 1)) + list(range(ord('®'), ord('ÿ') + 1)) 52 | cs = bs[:] 53 | n = 0 54 | for b in range(2**8): 55 | if b not in bs: 56 | bs.append(b) 57 | cs.append(2**8 + n) 58 | n += 1 59 | cs = [chr(n) for n in cs] 60 | return dict(zip(bs, cs)) 61 | 62 | 63 | def get_pairs(word): 64 | """Return set of symbol pairs in a word. 65 | 66 | Word is represented as tuple of symbols (symbols being variable-length strings). 
67 | """
68 | pairs = set()
69 | prev_char = word[0]
70 | for char in word[1:]:
71 | pairs.add((prev_char, char))
72 | prev_char = char
73 | return pairs
74 |
75 |
76 | def basic_clean(text):
77 | text = ftfy.fix_text(text)
78 | text = html.unescape(html.unescape(text))
79 | return text.strip()
80 |
81 |
82 | def whitespace_clean(text):
83 | text = re.sub(r'\s+', ' ', text)
84 | text = text.strip()
85 | return text
86 |
87 | class Tokenize:
88 |
89 | def __init__(self, tokenizer, max_seq_len=77, truncate=True):
90 | self.tokenizer = tokenizer
91 | self.max_seq_len = max_seq_len
92 | self.truncate = truncate
93 |
94 | def __call__(self, texts):
95 | expanded_dim = False
96 | if isinstance(texts, str):
97 | texts = [texts]
98 | expanded_dim = True
99 |
100 | sot_token = self.tokenizer.encoder['<|startoftext|>']
101 | eot_token = self.tokenizer.encoder['<|endoftext|>']
102 | all_tokens = [[sot_token] + self.tokenizer.encode(text) + [eot_token] for text in texts]
103 | result = torch.zeros(len(all_tokens), self.max_seq_len, dtype=torch.long)
104 |
105 | for i, tokens in enumerate(all_tokens):
106 | if len(tokens) > self.max_seq_len:
107 | if self.truncate:
108 | tokens = tokens[:self.max_seq_len]
109 | tokens[-1] = eot_token
110 | else:
111 | raise RuntimeError(f'Input {texts[i]} is too long for context length {self.max_seq_len}')
112 | result[i, :len(tokens)] = torch.tensor(tokens)
113 |
114 | if expanded_dim:
115 | return result[0]
116 |
117 | return result
118 |
119 |
120 | class SimpleTokenizer(object):
121 |
122 | def __init__(self, bpe_path: str = default_bpe()):
123 | self.byte_encoder = bytes_to_unicode()
124 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
125 | merges = gzip.open(bpe_path).read().decode('utf-8').split('\n')
126 | merges = merges[1:49152 - 256 - 2 + 1]
127 | merges = [tuple(merge.split()) for merge in merges]
128 | vocab = list(bytes_to_unicode().values())
129 | vocab = vocab + [v + '</w>' for v in vocab]
130 | for merge in merges:
131 | vocab.append(''.join(merge))
132 | vocab.extend(['<|startoftext|>', '<|endoftext|>'])
133 | self.encoder = dict(zip(vocab, range(len(vocab))))
134 | self.decoder = {v: k for k, v in self.encoder.items()}
135 | self.bpe_ranks = dict(zip(merges, range(len(merges))))
136 | self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
137 | self.pat = re.compile(
138 | r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
139 | re.IGNORECASE)
140 |
141 | def bpe(self, token):
142 | if token in self.cache:
143 | return self.cache[token]
144 | word = tuple(token[:-1]) + (token[-1] + '</w>', )
145 | pairs = get_pairs(word)
146 |
147 | if not pairs:
148 | return token + '</w>'
149 |
150 | while True:
151 | bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
152 | if bigram not in self.bpe_ranks:
153 | break
154 | first, second = bigram
155 | new_word = []
156 | i = 0
157 | while i < len(word):
158 | try:
159 | j = word.index(first, i)
160 | new_word.extend(word[i:j])
161 | i = j
162 | except: # noqa: E722
163 | new_word.extend(word[i:])
164 | break
165 |
166 | if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
167 | new_word.append(first + second)
168 | i += 2
169 | else:
170 | new_word.append(word[i])
171 | i += 1
172 | new_word = tuple(new_word)
173 | word = new_word
174 | if len(word) == 1:
175 | break
176 | else:
177 | pairs = get_pairs(word)
178 | word = ' '.join(word)
179 | self.cache[token] = word
180 | return word
181 |
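# A minimal usage sketch (illustrative text only): the '</w>' suffix marks end-of-word
# symbols, so "cat" as a whole word and "cat" as a prefix of a longer word map to
# different BPE tokens. Wrapping SimpleTokenizer with Tokenize pads/truncates to a
# fixed context length:
#   tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=77)
#   tokens = tokenizer("the task is panoptic")   # LongTensor of shape (77,)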
182 | def encode(self, text):
183 | bpe_tokens = []
184 | text = whitespace_clean(basic_clean(text)).lower()
185 | for token in re.findall(self.pat, text):
186 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
187 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
188 | return bpe_tokens
189 |
190 | def decode(self, tokens):
191 | text = ''.join([self.decoder[token] for token in tokens])
192 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors='replace').replace('</w>', ' ')
193 | return text
-------------------------------------------------------------------------------- /oneformer/datasetmapper_tta.py: --------------------------------------------------------------------------------
1 | import copy
2 | import numpy as np
3 | from typing import List
4 | import torch
5 | from fvcore.transforms import NoOpTransform
6 | from torch import nn
7 |
8 | from detectron2.config import configurable
9 | from detectron2.data.transforms import (
10 | RandomFlip,
11 | ResizeShortestEdge,
12 | ResizeTransform,
13 | apply_augmentations,
14 | )
15 |
16 | __all__ = ["DatasetMapperTTA"]
17 |
18 |
19 | class DatasetMapperTTA:
20 | """
21 | Implement test-time augmentation for detection data.
22 | It is a callable which takes a dataset dict from a detection dataset,
23 | and returns a list of dataset dicts where the images
24 | are augmented from the input image by the transformations defined in the config.
25 | This is used for test-time augmentation.
26 | """
27 |
28 | @configurable
29 | def __init__(self, min_sizes: List[int], max_size: int, flip: bool):
30 | """
31 | Args:
32 | min_sizes: list of short-edge sizes to resize the image to
33 | max_size: maximum height or width of resized images
34 | flip: whether to apply flipping augmentation
35 | """
36 | self.min_sizes = min_sizes
37 | self.max_size = max_size
38 | self.flip = flip
39 |
40 | @classmethod
41 | def from_config(cls, cfg):
42 | return {
43 | "min_sizes": cfg.TEST.AUG.MIN_SIZES,
44 | "max_size": cfg.TEST.AUG.MAX_SIZE,
45 | "flip": cfg.TEST.AUG.FLIP,
46 | }
47 |
48 | def __call__(self, dataset_dict):
49 | """
50 | Args:
51 | dataset_dict: a dict in standard model input format. See tutorials for details.
52 | Returns:
53 | list[dict]:
54 | a list of dicts, which contain augmented versions of the input image.
55 | The total number of dicts is ``len(min_sizes) * (2 if flip else 1)``.
56 | Each dict has field "transforms" which is a TransformList,
57 | containing the transforms that are used to generate this image.
58 | """ 59 | numpy_image = dataset_dict["image"].permute(1, 2, 0).numpy() 60 | shape = numpy_image.shape 61 | orig_shape = (dataset_dict["height"], dataset_dict["width"]) 62 | 63 | if shape[:2] != orig_shape: 64 | # It transforms the "original" image in the dataset to the input image 65 | pre_tfm = ResizeTransform(orig_shape[0], orig_shape[1], shape[0], shape[1]) 66 | else: 67 | pre_tfm = NoOpTransform() 68 | 69 | # Create all combinations of augmentations to use 70 | aug_candidates = [] # each element is a list[Augmentation] 71 | for min_size in self.min_sizes: 72 | resize = ResizeShortestEdge(min_size, self.max_size) 73 | aug_candidates.append([resize]) # resize only 74 | if self.flip: 75 | flip = RandomFlip(prob=1.0) 76 | aug_candidates.append([resize, flip]) # resize + flip 77 | 78 | # Apply all the augmentations 79 | ret = [] 80 | for aug in aug_candidates: 81 | new_image, tfms = apply_augmentations(aug, np.copy(numpy_image)) 82 | torch_image = torch.from_numpy(np.ascontiguousarray(new_image.transpose(2, 0, 1))) 83 | 84 | dic = copy.deepcopy(dataset_dict) 85 | dic["transforms"] = pre_tfm + tfms 86 | dic["image"] = torch_image 87 | ret.append(dic) 88 | return ret -------------------------------------------------------------------------------- /oneformer/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .detection_coco_evaluator import * 2 | from .coco_evaluator import * 3 | from .cityscapes_evaluation import CityscapesInstanceEvaluator -------------------------------------------------------------------------------- /oneformer/evaluation/instance_evaluation.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/evaluation/instance_evaluation.py 3 | # ------------------------------------------------------------------------------ 4 | 5 | import contextlib 6 | import copy 7 | import io 8 | import itertools 9 | import json 10 | import logging 11 | import numpy as np 12 | import os 13 | import pickle 14 | from collections import OrderedDict 15 | import pycocotools.mask as mask_util 16 | import torch 17 | from pycocotools.coco import COCO 18 | from pycocotools.cocoeval import COCOeval 19 | from tabulate import tabulate 20 | 21 | import detectron2.utils.comm as comm 22 | from detectron2.config import CfgNode 23 | from detectron2.data import MetadataCatalog 24 | from detectron2.data.datasets.coco import convert_to_coco_json 25 | from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco 26 | from detectron2.evaluation.fast_eval_api import COCOeval_opt 27 | from detectron2.structures import Boxes, BoxMode, pairwise_iou 28 | from detectron2.utils.file_io import PathManager 29 | from detectron2.utils.logger import create_small_table 30 | 31 | 32 | # modified from COCOEvaluator for instance segmetnat 33 | class InstanceSegEvaluator(COCOEvaluator): 34 | """ 35 | Evaluate AR for object proposals, AP for instance detection/segmentation, AP 36 | for keypoint detection outputs using COCO's metrics. 37 | See http://cocodataset.org/#detection-eval and 38 | http://cocodataset.org/#keypoints-eval to understand its metrics. 39 | The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means 40 | the metric cannot be computed (e.g. due to no predictions made). 
41 | 42 | In addition to COCO, this evaluator is able to support any bounding box detection, 43 | instance segmentation, or keypoint detection dataset. 44 | """ 45 | 46 | def _eval_predictions(self, predictions, img_ids=None): 47 | """ 48 | Evaluate predictions. Fill self._results with the metrics of the tasks. 49 | """ 50 | self._logger.info("Preparing results for COCO format ...") 51 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) 52 | tasks = self._tasks or self._tasks_from_predictions(coco_results) 53 | 54 | # unmap the category ids for COCO 55 | if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): 56 | dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id 57 | # all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) 58 | # num_classes = len(all_contiguous_ids) 59 | # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 60 | 61 | reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} 62 | for result in coco_results: 63 | category_id = result["category_id"] 64 | # assert category_id < num_classes, ( 65 | # f"A prediction has class={category_id}, " 66 | # f"but the dataset only has {num_classes} classes and " 67 | # f"predicted class id should be in [0, {num_classes - 1}]." 68 | # ) 69 | assert category_id in reverse_id_mapping, ( 70 | f"A prediction has class={category_id}, " 71 | f"but the dataset only has class ids in {dataset_id_to_contiguous_id}." 72 | ) 73 | result["category_id"] = reverse_id_mapping[category_id] 74 | 75 | if self._output_dir: 76 | file_path = os.path.join(self._output_dir, "coco_instances_results.json") 77 | self._logger.info("Saving results to {}".format(file_path)) 78 | with PathManager.open(file_path, "w") as f: 79 | f.write(json.dumps(coco_results)) 80 | f.flush() 81 | 82 | if not self._do_evaluation: 83 | self._logger.info("Annotations are not available for evaluation.") 84 | return 85 | 86 | self._logger.info( 87 | "Evaluating predictions with {} COCO API...".format( 88 | "unofficial" if self._use_fast_impl else "official" 89 | ) 90 | ) 91 | for task in sorted(tasks): 92 | assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!" 93 | coco_eval = ( 94 | _evaluate_predictions_on_coco( 95 | self._coco_api, 96 | coco_results, 97 | task, 98 | kpt_oks_sigmas=self._kpt_oks_sigmas, 99 | use_fast_impl=self._use_fast_impl, 100 | img_ids=img_ids, 101 | max_dets_per_image=self._max_dets_per_image, 102 | ) 103 | if len(coco_results) > 0 104 | else None # cocoapi does not handle empty results very well 105 | ) 106 | 107 | res = self._derive_coco_results( 108 | coco_eval, task, class_names=self._metadata.get("thing_classes") 109 | ) 110 | self._results[task] = res 111 | -------------------------------------------------------------------------------- /oneformer/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone.swin import D2SwinTransformer 2 | from .backbone.dinat import D2DiNAT 3 | from .backbone.convnext import D2ConvNeXt 4 | from .pixel_decoder.fpn import BasePixelDecoder 5 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder 6 | from .meta_arch.oneformer_head import OneFormerHead 7 | -------------------------------------------------------------------------------- /oneformer/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
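# A small illustrative sketch (config values are hypothetical): importing the modules in
# oneformer/modeling/__init__.py registers them with detectron2's registries, so they can
# be selected by name from a config, e.g.
#   cfg.MODEL.BACKBONE.NAME = "D2SwinTransformer"        # resolved via BACKBONE_REGISTRY
#   cfg.MODEL.SEM_SEG_HEAD.NAME = "OneFormerHead"        # resolved via SEM_SEG_HEADS_REGISTRY
#   backbone = build_backbone(cfg)                        # from detectron2.modeling
#   sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape())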
2 | -------------------------------------------------------------------------------- /oneformer/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /oneformer/modeling/meta_arch/oneformer_head.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/modeling/meta_arch/mask_former_head.py 3 | # Modified by Jitesh Jain (https://github.com/praeclarumjj3) 4 | # ------------------------------------------------------------------------------ 5 | 6 | import logging 7 | from copy import deepcopy 8 | from typing import Callable, Dict, List, Optional, Tuple, Union 9 | 10 | import fvcore.nn.weight_init as weight_init 11 | from torch import nn 12 | from torch.nn import functional as F 13 | 14 | from detectron2.config import configurable 15 | from detectron2.layers import Conv2d, ShapeSpec, get_norm 16 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY 17 | from ..pixel_decoder.fpn import build_pixel_decoder 18 | from ..transformer_decoder.oneformer_transformer_decoder import build_transformer_decoder 19 | 20 | @SEM_SEG_HEADS_REGISTRY.register() 21 | class OneFormerHead(nn.Module): 22 | 23 | _version = 2 24 | 25 | def _load_from_state_dict( 26 | self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs 27 | ): 28 | version = local_metadata.get("version", None) 29 | if version is None or version < 2: 30 | # Do not warn if train from scratch 31 | scratch = True 32 | logger = logging.getLogger(__name__) 33 | for k in list(state_dict.keys()): 34 | newk = k 35 | if "sem_seg_head" in k and not k.startswith(prefix + "predictor"): 36 | newk = k.replace(prefix, prefix + "pixel_decoder.") 37 | # logger.debug(f"{k} ==> {newk}") 38 | if newk != k: 39 | state_dict[newk] = state_dict[k] 40 | del state_dict[k] 41 | scratch = False 42 | 43 | if not scratch: 44 | logger.warning( 45 | f"Weight format of {self.__class__.__name__} have changed! " 46 | "Please upgrade your models. Applying automatic conversion now ..." 47 | ) 48 | 49 | @configurable 50 | def __init__( 51 | self, 52 | input_shape: Dict[str, ShapeSpec], 53 | *, 54 | num_classes: int, 55 | pixel_decoder: nn.Module, 56 | loss_weight: float = 1.0, 57 | ignore_value: int = -1, 58 | # extra parameters 59 | transformer_predictor: nn.Module, 60 | transformer_in_feature: str, 61 | ): 62 | """ 63 | NOTE: this interface is experimental. 64 | Args: 65 | input_shape: shapes (channels and stride) of the input features 66 | num_classes: number of classes to predict 67 | pixel_decoder: the pixel decoder module 68 | loss_weight: loss weight 69 | ignore_value: category id to be ignored during training. 
70 | transformer_predictor: the transformer decoder that makes prediction 71 | transformer_in_feature: input feature name to the transformer_predictor 72 | """ 73 | super().__init__() 74 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) 75 | self.in_features = [k for k, v in input_shape] 76 | feature_strides = [v.stride for k, v in input_shape] 77 | feature_channels = [v.channels for k, v in input_shape] 78 | 79 | self.ignore_value = ignore_value 80 | self.common_stride = 4 81 | self.loss_weight = loss_weight 82 | 83 | self.pixel_decoder = pixel_decoder 84 | self.predictor = transformer_predictor 85 | self.transformer_in_feature = transformer_in_feature 86 | 87 | self.num_classes = num_classes 88 | 89 | @classmethod 90 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): 91 | # figure out in_channels to transformer predictor 92 | if cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder": 93 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 94 | elif cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding": 95 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 96 | elif cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder": 97 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 98 | else: 99 | transformer_predictor_in_channels = input_shape[cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE].channels 100 | 101 | return { 102 | "input_shape": { 103 | k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES 104 | }, 105 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 106 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 107 | "pixel_decoder": build_pixel_decoder(cfg, input_shape), 108 | "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, 109 | "transformer_in_feature": cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE, 110 | "transformer_predictor": build_transformer_decoder( 111 | cfg, 112 | transformer_predictor_in_channels, 113 | mask_classification=True, 114 | ), 115 | } 116 | 117 | def forward(self, features, tasks, mask=None): 118 | return self.layers(features, tasks, mask) 119 | 120 | def layers(self, features, tasks, mask=None): 121 | mask_features, transformer_encoder_features, multi_scale_features, _, _ = self.pixel_decoder.forward_features(features) 122 | 123 | if self.transformer_in_feature == "multi_scale_pixel_decoder": 124 | predictions = self.predictor(multi_scale_features, mask_features, tasks, mask) 125 | else: 126 | if self.transformer_in_feature == "transformer_encoder": 127 | assert ( 128 | transformer_encoder_features is not None 129 | ), "Please use the TransformerEncoderPixelDecoder." 
130 | predictions = self.predictor(transformer_encoder_features, mask_features, mask) 131 | elif self.transformer_in_feature == "pixel_embedding": 132 | predictions = self.predictor(mask_features, mask_features, mask) 133 | else: 134 | predictions = self.predictor(features[self.transformer_in_feature], mask_features, mask) 135 | return predictions 136 | -------------------------------------------------------------------------------- /oneformer/modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /oneformer/modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn_func import MSDeformAttnFunction 10 | 11 | -------------------------------------------------------------------------------- /oneformer/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | if torch.cuda.is_available(): 22 | try: 23 | import MultiScaleDeformableAttention as MSDA 24 | except ModuleNotFoundError as e: 25 | info_string = ( 26 | "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" 27 | "\t`cd mask2former/modeling/pixel_decoder/ops`\n" 28 | "\t`sh make.sh`\n" 29 | ) 30 | raise ModuleNotFoundError(info_string) 31 | else: 32 | MultiScaleDeformableAttention = None 33 | 34 | 35 | class MSDeformAttnFunction(Function): 36 | @staticmethod 37 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 38 | ctx.im2col_step = im2col_step 39 | output = MSDA.ms_deform_attn_forward( 40 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 41 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 42 | return output 43 | 44 | @staticmethod 45 | @once_differentiable 46 | def backward(ctx, grad_output): 47 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 48 | grad_value, grad_sampling_loc, grad_attn_weight = \ 49 | MSDA.ms_deform_attn_backward( 50 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 51 | 52 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 53 | 54 | 55 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 56 | # for debug and test only, 57 | # need to use cuda version instead 58 | N_, S_, M_, D_ = value.shape 59 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 60 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 61 | sampling_grids = 2 * sampling_locations - 1 62 | sampling_value_list = [] 63 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 64 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 65 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 66 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 67 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 68 | # N_*M_, D_, Lq_, P_ 69 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 70 | mode='bilinear', padding_mode='zeros', align_corners=False) 71 | sampling_value_list.append(sampling_value_l_) 72 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 73 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 74 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 75 | return output.transpose(1, 2).contiguous() 76 | -------------------------------------------------------------------------------- /oneformer/modeling/pixel_decoder/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 
------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | # Copyright (c) Facebook, Inc. and its affiliates. 11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 12 | 13 | python setup.py build install 14 | -------------------------------------------------------------------------------- /oneformer/modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | 10 | from .ms_deform_attn import MSDeformAttn 11 | -------------------------------------------------------------------------------- /oneformer/modeling/pixel_decoder/ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import warnings 17 | import math 18 | 19 | import torch 20 | from torch import nn 21 | import torch.nn.functional as F 22 | from torch.nn.init import xavier_uniform_, constant_ 23 | 24 | if torch.cuda.is_available(): 25 | from ..functions import MSDeformAttnFunction 26 | else: 27 | MSDeformAttnFunction = None 28 | from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch 29 | 30 | 31 | def _is_power_of_2(n): 32 | if (not isinstance(n, int)) or (n < 0): 33 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 34 | return (n & (n-1) == 0) and n != 0 35 | 36 | 37 | class MSDeformAttn(nn.Module): 38 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): 39 | """ 40 | Multi-Scale Deformable Attention Module 41 | :param d_model hidden dimension 42 | :param n_levels number of feature levels 43 | :param n_heads number of attention heads 44 | :param n_points number of sampling points per attention head per feature level 45 | """ 46 | super().__init__() 47 | if d_model % n_heads != 0: 48 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 49 | _d_per_head = d_model // n_heads 50 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 51 | if not _is_power_of_2(_d_per_head): 52 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 53 | "which is more efficient in our CUDA implementation.") 54 | 55 | self.im2col_step = 128 56 | 57 | self.d_model = d_model 58 | self.n_levels = n_levels 59 | self.n_heads = n_heads 60 | self.n_points = n_points 61 | 62 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 63 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 64 | self.value_proj = nn.Linear(d_model, d_model) 65 | self.output_proj = nn.Linear(d_model, d_model) 66 | 67 | self._reset_parameters() 68 | 69 | def _reset_parameters(self): 70 | constant_(self.sampling_offsets.weight.data, 0.) 71 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 72 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 73 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 74 | for i in range(self.n_points): 75 | grid_init[:, :, i, :] *= i + 1 76 | with torch.no_grad(): 77 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 78 | constant_(self.attention_weights.weight.data, 0.) 79 | constant_(self.attention_weights.bias.data, 0.) 80 | xavier_uniform_(self.value_proj.weight.data) 81 | constant_(self.value_proj.bias.data, 0.) 82 | xavier_uniform_(self.output_proj.weight.data) 83 | constant_(self.output_proj.bias.data, 0.) 
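# A toy shape walk-through (values here are illustrative, not required): with the defaults
# d_model=256, n_levels=4, n_heads=8, n_points=4 and feature maps of 64x64, 32x32, 16x16, 8x8:
#   input_spatial_shapes    -> [[64, 64], [32, 32], [16, 16], [8, 8]]
#   input_level_start_index -> [0, 4096, 5120, 5376]
#   input_flatten           -> (N, 4096 + 1024 + 256 + 64, 256) = (N, 5440, 256)
#   query                   -> (N, Len_q, 256), reference_points -> (N, Len_q, 4, 2) in [0, 1]
#   output                  -> (N, Len_q, 256)
# Each query predicts n_heads * n_levels * n_points = 128 sampling offsets around its
# reference points, with attention weights softmax-normalized over the 16 samples of each head.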
84 | 85 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 86 | """ 87 | :param query (N, Length_{query}, C) 88 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 89 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 90 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 91 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 92 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 93 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 94 | 95 | :return output (N, Length_{query}, C) 96 | """ 97 | N, Len_q, _ = query.shape 98 | N, Len_in, _ = input_flatten.shape 99 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 100 | 101 | value = self.value_proj(input_flatten) 102 | if input_padding_mask is not None: 103 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 104 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 105 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 106 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 107 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 108 | # N, Len_q, n_heads, n_levels, n_points, 2 109 | if reference_points.shape[-1] == 2: 110 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 111 | sampling_locations = reference_points[:, :, None, :, None, :] \ 112 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] 113 | elif reference_points.shape[-1] == 4: 114 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 115 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 116 | else: 117 | raise ValueError( 118 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 119 | if torch.cuda.is_available(): 120 | output = MSDeformAttnFunction.apply( 121 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 122 | else: 123 | ## CPU 124 | output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 125 | output = self.output_proj(output) 126 | return output 127 | -------------------------------------------------------------------------------- /oneformer/modeling/pixel_decoder/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | import os 13 | import glob 14 | 15 | import torch 16 | 17 | from torch.utils.cpp_extension import CUDA_HOME 18 | from torch.utils.cpp_extension import CppExtension 19 | from torch.utils.cpp_extension import CUDAExtension 20 | 21 | from setuptools import find_packages 22 | from setuptools import setup 23 | 24 | requirements = ["torch", "torchvision"] 25 | 26 | def get_extensions(): 27 | this_dir = os.path.dirname(os.path.abspath(__file__)) 28 | extensions_dir = os.path.join(this_dir, "src") 29 | 30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 33 | 34 | sources = main_file + source_cpu 35 | extension = CppExtension 36 | extra_compile_args = {"cxx": []} 37 | define_macros = [] 38 | 39 | # Force cuda since torch ask for a device, not if cuda is in fact available. 40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: 41 | extension = CUDAExtension 42 | sources += source_cuda 43 | define_macros += [("WITH_CUDA", None)] 44 | extra_compile_args["nvcc"] = [ 45 | "-DCUDA_HAS_FP16=1", 46 | "-D__CUDA_NO_HALF_OPERATORS__", 47 | "-D__CUDA_NO_HALF_CONVERSIONS__", 48 | "-D__CUDA_NO_HALF2_OPERATORS__", 49 | ] 50 | else: 51 | if CUDA_HOME is None: 52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') 53 | else: 54 | raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') 55 | 56 | sources = [os.path.join(extensions_dir, s) for s in sources] 57 | include_dirs = [extensions_dir] 58 | ext_modules = [ 59 | extension( 60 | "MultiScaleDeformableAttention", 61 | sources, 62 | include_dirs=include_dirs, 63 | define_macros=define_macros, 64 | extra_compile_args=extra_compile_args, 65 | ) 66 | ] 67 | return ext_modules 68 | 69 | setup( 70 | name="MultiScaleDeformableAttention", 71 | version="1.0", 72 | author="Weijie Su", 73 | url="https://github.com/fundamentalvision/Deformable-DETR", 74 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 75 | packages=find_packages(exclude=("configs", "tests",)), 76 | ext_modules=get_extensions(), 77 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 78 | ) 79 | -------------------------------------------------------------------------------- /oneformer/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #include <vector>
17 |
18 | #include <ATen/ATen.h>
19 | #include <ATen/cuda/CUDAContext.h>
20 |
21 |
22 | at::Tensor
23 | ms_deform_attn_cpu_forward(
24 | const at::Tensor &value,
25 | const at::Tensor &spatial_shapes,
26 | const at::Tensor &level_start_index,
27 | const at::Tensor &sampling_loc,
28 | const at::Tensor &attn_weight,
29 | const int im2col_step)
30 | {
31 | AT_ERROR("Not implement on cpu");
32 | }
33 |
34 | std::vector<at::Tensor>
35 | ms_deform_attn_cpu_backward(
36 | const at::Tensor &value,
37 | const at::Tensor &spatial_shapes,
38 | const at::Tensor &level_start_index,
39 | const at::Tensor &sampling_loc,
40 | const at::Tensor &attn_weight,
41 | const at::Tensor &grad_output,
42 | const int im2col_step)
43 | {
44 | AT_ERROR("Not implement on cpu");
45 | }
46 |
47 |
-------------------------------------------------------------------------------- /oneformer/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: --------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #pragma once
17 | #include <torch/extension.h>
18 |
19 | at::Tensor
20 | ms_deform_attn_cpu_forward(
21 | const at::Tensor &value,
22 | const at::Tensor &spatial_shapes,
23 | const at::Tensor &level_start_index,
24 | const at::Tensor &sampling_loc,
25 | const at::Tensor &attn_weight,
26 | const int im2col_step);
27 |
28 | std::vector<at::Tensor>
29 | ms_deform_attn_cpu_backward(
30 | const at::Tensor &value,
31 | const at::Tensor &spatial_shapes,
32 | const at::Tensor &level_start_index,
33 | const at::Tensor &sampling_loc,
34 | const at::Tensor &attn_weight,
35 | const at::Tensor &grad_output,
36 | const int im2col_step);
37 |
38 |
39 |
-------------------------------------------------------------------------------- /oneformer/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu: --------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #include <vector>
17 | #include "cuda/ms_deform_im2col_cuda.cuh"
18 |
19 | #include <ATen/ATen.h>
20 | #include <ATen/cuda/CUDAContext.h>
21 | #include <cuda.h>
22 | #include <cuda_runtime.h>
23 |
24 |
25 | at::Tensor ms_deform_attn_cuda_forward(
26 | const at::Tensor &value,
27 | const at::Tensor &spatial_shapes,
28 | const at::Tensor &level_start_index,
29 | const at::Tensor &sampling_loc,
30 | const at::Tensor &attn_weight,
31 | const int im2col_step)
32 | {
33 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
34 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
35 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
36 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
37 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
38 |
39 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
40 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
41 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
42 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
43 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
44 |
45 | const int batch = value.size(0);
46 | const int spatial_size = value.size(1);
47 | const int num_heads = value.size(2);
48 | const int channels = value.size(3);
49 |
50 | const int num_levels = spatial_shapes.size(0);
51 |
52 | const int num_query = sampling_loc.size(1);
53 | const int num_point = sampling_loc.size(4);
54 |
55 | const int im2col_step_ = std::min(batch, im2col_step);
56 |
57 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
58 |
59 | auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
60 |
61 | const int batch_n = im2col_step_;
62 | auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
63 | auto per_value_size = spatial_size * num_heads * channels;
64 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
65 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
66 | for (int n = 0; n < batch/im2col_step_; ++n)
67 | {
68 | auto columns = output_n.select(0, n);
69 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] {
70 | ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
71 | value.data<scalar_t>() + n * im2col_step_ * per_value_size,
72 | spatial_shapes.data<int64_t>(),
73 | level_start_index.data<int64_t>(),
74 | sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
75 | attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
76 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
77 | columns.data<scalar_t>());
78 |
79 | }));
80 | }
81 |
82 | output = output.view({batch, num_query, num_heads*channels});
83 |
84 | return output;
85 | }
86 |
87 |
88 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
89 | const at::Tensor &value,
90 | const at::Tensor &spatial_shapes,
91 | const at::Tensor &level_start_index,
92 | const at::Tensor &sampling_loc,
93 | const at::Tensor &attn_weight,
94 | const at::Tensor &grad_output,
95 | const int im2col_step)
96 | {
97 |
98 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
99 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
100 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
101 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
102 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
103 | AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
104 |
105 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
106 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
107 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
108 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
109 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
110 | AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
111 |
112 | const int batch = value.size(0);
113 | const int spatial_size = value.size(1);
114 | const int num_heads = value.size(2);
115 | const int channels = value.size(3);
116 |
117 | const int num_levels = spatial_shapes.size(0);
118 |
119 | const int num_query = sampling_loc.size(1);
120 | const int num_point = sampling_loc.size(4);
121 |
122 | const int im2col_step_ = std::min(batch, im2col_step);
123 |
124 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
125 |
126 | auto grad_value = at::zeros_like(value);
127 | auto grad_sampling_loc = at::zeros_like(sampling_loc);
128 | auto grad_attn_weight = at::zeros_like(attn_weight);
129 |
130 | const int batch_n = im2col_step_;
131 | auto per_value_size = spatial_size * num_heads * channels;
132 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
133 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
134 | auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
135 |
136 | for (int n = 0; n < batch/im2col_step_; ++n)
137 | {
138 | auto grad_output_g = grad_output_n.select(0, n);
139 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] {
140 | ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
141 | grad_output_g.data<scalar_t>(),
142 | value.data<scalar_t>() + n * im2col_step_ * per_value_size,
143 | spatial_shapes.data<int64_t>(),
144 | level_start_index.data<int64_t>(),
145 | sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
146 | attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
147 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
148 | grad_value.data<scalar_t>() + n * im2col_step_ * per_value_size,
149 | grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
150 | grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
151 |
152 | }));
153 | }
154 |
155 | return {
156 | grad_value, grad_sampling_loc, grad_attn_weight
157 | };
158 | }
--------------------------------------------------------------------------------
/oneformer/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: --------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #pragma once
17 | #include <torch/extension.h>
18 |
19 | at::Tensor ms_deform_attn_cuda_forward(
20 | const at::Tensor &value,
21 | const at::Tensor &spatial_shapes,
22 | const at::Tensor &level_start_index,
23 | const at::Tensor &sampling_loc,
24 | const at::Tensor &attn_weight,
25 | const int im2col_step);
26 |
27 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
28 | const at::Tensor &value,
29 | const at::Tensor &spatial_shapes,
30 | const at::Tensor &level_start_index,
31 | const at::Tensor &sampling_loc,
32 | const at::Tensor &attn_weight,
33 | const at::Tensor &grad_output,
34 | const int im2col_step);
35 |
36 |
-------------------------------------------------------------------------------- /oneformer/modeling/pixel_decoder/ops/src/ms_deform_attn.h: --------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #pragma once
17 |
18 | #include "cpu/ms_deform_attn_cpu.h"
19 |
20 | #ifdef WITH_CUDA
21 | #include "cuda/ms_deform_attn_cuda.h"
22 | #endif
23 |
24 |
25 | at::Tensor
26 | ms_deform_attn_forward(
27 | const at::Tensor &value,
28 | const at::Tensor &spatial_shapes,
29 | const at::Tensor &level_start_index,
30 | const at::Tensor &sampling_loc,
31 | const at::Tensor &attn_weight,
32 | const int im2col_step)
33 | {
34 | if (value.type().is_cuda())
35 | {
36 | #ifdef WITH_CUDA
37 | return ms_deform_attn_cuda_forward(
38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
39 | #else
40 | AT_ERROR("Not compiled with GPU support");
41 | #endif
42 | }
43 | AT_ERROR("Not implemented on the CPU");
44 | }
45 |
46 | std::vector<at::Tensor>
47 | ms_deform_attn_backward(
48 | const at::Tensor &value,
49 | const at::Tensor &spatial_shapes,
50 | const at::Tensor &level_start_index,
51 | const at::Tensor &sampling_loc,
52 | const at::Tensor &attn_weight,
53 | const at::Tensor &grad_output,
54 | const int im2col_step)
55 | {
56 | if (value.type().is_cuda())
57 | {
58 | #ifdef WITH_CUDA
59 | return ms_deform_attn_cuda_backward(
60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
61 | #else
62 | AT_ERROR("Not compiled with GPU support");
63 | #endif
64 | }
65 | AT_ERROR("Not implemented on the CPU");
66 | }
67 |
68 |
-------------------------------------------------------------------------------- /oneformer/modeling/pixel_decoder/ops/src/vision.cpp: --------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #include "ms_deform_attn.h"
17 |
18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
21 | }
22 |
-------------------------------------------------------------------------------- /oneformer/modeling/pixel_decoder/ops/test.py: --------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import time 17 | import torch 18 | import torch.nn as nn 19 | from torch.autograd import gradcheck 20 | 21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | N, M, D = 1, 2, 2 25 | Lq, L, P = 2, 2, 2 26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 28 | S = sum([(H*W).item() for H, W in shapes]) 29 | 30 | 31 | torch.manual_seed(3) 32 | 33 | 34 | @torch.no_grad() 35 | def check_forward_equal_with_pytorch_double(): 36 | value = torch.rand(N, S, M, D).cuda() * 0.01 37 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 38 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 39 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 40 | im2col_step = 2 41 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 42 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 43 | fwdok = torch.allclose(output_cuda, output_pytorch) 44 | max_abs_err = (output_cuda - output_pytorch).abs().max() 45 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 46 | 47 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 48 | 49 | 50 | @torch.no_grad() 51 | def check_forward_equal_with_pytorch_float(): 52 | value = torch.rand(N, S, M, D).cuda() * 0.01 53 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 54 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 55 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 56 | im2col_step = 2 57 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 58 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 59 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 60 | max_abs_err = (output_cuda - output_pytorch).abs().max() 61 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 62 | 63 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 64 | 65 | 66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 67 | 68 | value = torch.rand(N, S, M, channels).cuda() * 0.01 69 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 70 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 71 | attention_weights /= 
attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 72 | im2col_step = 2 73 | func = MSDeformAttnFunction.apply 74 | 75 | value.requires_grad = grad_value 76 | sampling_locations.requires_grad = grad_sampling_loc 77 | attention_weights.requires_grad = grad_attn_weight 78 | 79 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 80 | 81 | print(f'* {gradok} check_gradient_numerical(D={channels})') 82 | 83 | 84 | if __name__ == '__main__': 85 | check_forward_equal_with_pytorch_double() 86 | check_forward_equal_with_pytorch_float() 87 | 88 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 89 | check_gradient_numerical(channels, True, True, True) 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /oneformer/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .oneformer_transformer_decoder import ContrastiveMultiScaleMaskedTransformerDecoder -------------------------------------------------------------------------------- /oneformer/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/modeling/transformer_decoder/position_encoding.py 3 | # Modified by Jitesh Jain (https://github.com/praeclarumjj3) 4 | # ------------------------------------------------------------------------------ 5 | 6 | """ 7 | Various positional encodings for the transformer. 8 | """ 9 | import math 10 | 11 | import torch 12 | from torch import nn 13 | 14 | 15 | class PositionEmbeddingSine(nn.Module): 16 | """ 17 | This is a more standard version of the position embedding, very similar to the one 18 | used by the Attention is all you need paper, generalized to work on images. 
19 | """ 20 | 21 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 22 | super().__init__() 23 | self.num_pos_feats = num_pos_feats 24 | self.temperature = temperature 25 | self.normalize = normalize 26 | if scale is not None and normalize is False: 27 | raise ValueError("normalize should be True if scale is passed") 28 | if scale is None: 29 | scale = 2 * math.pi 30 | self.scale = scale 31 | 32 | def forward(self, x, mask=None): 33 | if mask is None: 34 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 35 | not_mask = ~mask 36 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 37 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 38 | if self.normalize: 39 | eps = 1e-6 40 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 41 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 42 | 43 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 44 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 45 | 46 | pos_x = x_embed[:, :, :, None] / dim_t 47 | pos_y = y_embed[:, :, :, None] / dim_t 48 | pos_x = torch.stack( 49 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 50 | ).flatten(3) 51 | pos_y = torch.stack( 52 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 53 | ).flatten(3) 54 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 55 | return pos 56 | 57 | def __repr__(self, _repr_indent=4): 58 | head = "Positional encoding " + self.__class__.__name__ 59 | body = [ 60 | "num_pos_feats: {}".format(self.num_pos_feats), 61 | "temperature: {}".format(self.temperature), 62 | "normalize: {}".format(self.normalize), 63 | "scale: {}".format(self.scale), 64 | ] 65 | # _repr_indent = 4 66 | lines = [head] + [" " * _repr_indent + line for line in body] 67 | return "\n".join(lines) 68 | -------------------------------------------------------------------------------- /oneformer/test_time_augmentation.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/test_time_augmentation.py 3 | # ------------------------------------------------------------------------------ 4 | 5 | import copy 6 | import logging 7 | from itertools import count 8 | 9 | import numpy as np 10 | import torch 11 | from fvcore.transforms import HFlipTransform 12 | from torch import nn 13 | from torch.nn.parallel import DistributedDataParallel 14 | 15 | from detectron2.data.detection_utils import read_image 16 | from .datasetmapper_tta import DatasetMapperTTA 17 | import torch.nn.functional as F 18 | 19 | __all__ = [ 20 | "SemanticSegmentorWithTTA", 21 | ] 22 | 23 | 24 | class SemanticSegmentorWithTTA(nn.Module): 25 | """ 26 | A SemanticSegmentor with test-time augmentation enabled. 27 | Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`. 28 | """ 29 | 30 | def __init__(self, cfg, model, tta_mapper=None, batch_size=1): 31 | """ 32 | Args: 33 | cfg (CfgNode): 34 | model (SemanticSegmentor): a SemanticSegmentor to apply TTA on. 35 | tta_mapper (callable): takes a dataset dict and returns a list of 36 | augmented versions of the dataset dict. Defaults to 37 | `DatasetMapperTTA(cfg)`. 38 | batch_size (int): batch the augmented images into this batch size for inference. 
39 | """ 40 | super().__init__() 41 | if isinstance(model, DistributedDataParallel): 42 | model = model.module 43 | self.cfg = cfg.clone() 44 | self.num_classes = self.cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES 45 | 46 | self.model = model 47 | 48 | if tta_mapper is None: 49 | tta_mapper = DatasetMapperTTA(cfg) 50 | self.tta_mapper = tta_mapper 51 | self.batch_size = batch_size 52 | 53 | def __call__(self, batched_inputs): 54 | """ 55 | Same input/output format as :meth:`SemanticSegmentor.forward` 56 | """ 57 | 58 | def _maybe_read_image(dataset_dict): 59 | ret = copy.copy(dataset_dict) 60 | if "image" not in ret: 61 | image = read_image(ret.pop("file_name"), self.model.input_format) 62 | image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW 63 | ret["image"] = image 64 | if "height" not in ret and "width" not in ret: 65 | ret["height"] = image.shape[1] 66 | ret["width"] = image.shape[2] 67 | return ret 68 | 69 | processed_results = [] 70 | for x in batched_inputs: 71 | result = self._inference_one_image(_maybe_read_image(x)) 72 | processed_results.append(result) 73 | return processed_results 74 | 75 | def _inference_one_image(self, input): 76 | """ 77 | Args: 78 | input (dict): one dataset dict with "image" field being a CHW tensor 79 | Returns: 80 | dict: one output dict 81 | """ 82 | orig_shape = (input["height"], input["width"]) 83 | augmented_inputs, tfms = self._get_augmented_inputs(input) 84 | 85 | final_predictions = None 86 | count_predictions = 0 87 | for input, tfm in zip(augmented_inputs, tfms): 88 | count_predictions += 1 89 | with torch.no_grad(): 90 | if final_predictions is None: 91 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 92 | final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2]) 93 | else: 94 | final_predictions = self.model([input])[0].pop("sem_seg") 95 | else: 96 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 97 | final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2]) 98 | else: 99 | final_predictions += self.model([input])[0].pop("sem_seg") 100 | 101 | final_predictions = final_predictions / count_predictions 102 | return {"sem_seg": final_predictions} 103 | 104 | def _get_augmented_inputs(self, input): 105 | augmented_inputs = self.tta_mapper(input) 106 | tfms = [x.pop("transforms") for x in augmented_inputs] 107 | return augmented_inputs, tfms -------------------------------------------------------------------------------- /oneformer/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .events import setup_wandb, WandbWriter -------------------------------------------------------------------------------- /oneformer/utils/box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Utilities for bounding box manipulation and GIoU. 
4 | """ 5 | import torch, os 6 | from torchvision.ops.boxes import box_area 7 | 8 | 9 | def box_cxcywh_to_xyxy(x): 10 | x_c, y_c, w, h = x.unbind(-1) 11 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 12 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 13 | return torch.stack(b, dim=-1) 14 | 15 | 16 | def box_xyxy_to_cxcywh(x): 17 | x0, y0, x1, y1 = x.unbind(-1) 18 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 19 | (x1 - x0), (y1 - y0)] 20 | return torch.stack(b, dim=-1) 21 | 22 | 23 | # modified from torchvision to also return the union 24 | def box_iou(boxes1, boxes2): 25 | area1 = box_area(boxes1) 26 | area2 = box_area(boxes2) 27 | 28 | # import ipdb; ipdb.set_trace() 29 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 30 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 31 | 32 | wh = (rb - lt).clamp(min=0) # [N,M,2] 33 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 34 | 35 | union = area1[:, None] + area2 - inter 36 | 37 | iou = inter / (union + 1e-6) 38 | return iou, union 39 | 40 | 41 | def generalized_box_iou(boxes1, boxes2): 42 | """ 43 | Generalized IoU from https://giou.stanford.edu/ 44 | The boxes should be in [x0, y0, x1, y1] format 45 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 46 | and M = len(boxes2) 47 | """ 48 | # degenerate boxes gives inf / nan results 49 | # so do an early check 50 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 51 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 52 | # except: 53 | # import ipdb; ipdb.set_trace() 54 | iou, union = box_iou(boxes1, boxes2) 55 | 56 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 57 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 58 | 59 | wh = (rb - lt).clamp(min=0) # [N,M,2] 60 | area = wh[:, :, 0] * wh[:, :, 1] 61 | 62 | return iou - (area - union) / (area + 1e-6) 63 | 64 | 65 | 66 | # modified from torchvision to also return the union 67 | def box_iou_pairwise(boxes1, boxes2): 68 | area1 = box_area(boxes1) 69 | area2 = box_area(boxes2) 70 | 71 | lt = torch.max(boxes1[:, :2], boxes2[:, :2]) # [N,2] 72 | rb = torch.min(boxes1[:, 2:], boxes2[:, 2:]) # [N,2] 73 | 74 | wh = (rb - lt).clamp(min=0) # [N,2] 75 | inter = wh[:, 0] * wh[:, 1] # [N] 76 | 77 | union = area1 + area2 - inter 78 | 79 | iou = inter / union 80 | return iou, union 81 | 82 | 83 | def generalized_box_iou_pairwise(boxes1, boxes2): 84 | """ 85 | Generalized IoU from https://giou.stanford.edu/ 86 | Input: 87 | - boxes1, boxes2: N,4 88 | Output: 89 | - giou: N, 4 90 | """ 91 | # degenerate boxes gives inf / nan results 92 | # so do an early check 93 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 94 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 95 | assert boxes1.shape == boxes2.shape 96 | iou, union = box_iou_pairwise(boxes1, boxes2) # N, 4 97 | 98 | lt = torch.min(boxes1[:, :2], boxes2[:, :2]) 99 | rb = torch.max(boxes1[:, 2:], boxes2[:, 2:]) 100 | 101 | wh = (rb - lt).clamp(min=0) # [N,2] 102 | area = wh[:, 0] * wh[:, 1] 103 | 104 | return iou - (area - union) / area 105 | 106 | def masks_to_boxes(masks): 107 | """Compute the bounding boxes around the provided masks 108 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 
109 | Returns a [N, 4] tensors, with the boxes in xyxy format 110 | """ 111 | if masks.numel() == 0: 112 | return torch.zeros((0, 4), device=masks.device) 113 | 114 | h, w = masks.shape[-2:] 115 | 116 | y = torch.arange(0, h, dtype=torch.float) 117 | x = torch.arange(0, w, dtype=torch.float) 118 | y, x = torch.meshgrid(y, x) 119 | 120 | x_mask = (masks * x.unsqueeze(0)) 121 | x_max = x_mask.flatten(1).max(-1)[0] 122 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 123 | 124 | y_mask = (masks * y.unsqueeze(0)) 125 | y_max = y_mask.flatten(1).max(-1)[0] 126 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 127 | 128 | return torch.stack([x_min, y_min, x_max, y_max], 1) 129 | 130 | if __name__ == '__main__': 131 | x = torch.rand(5, 4) 132 | y = torch.rand(3, 4) 133 | iou, union = box_iou(x, y) -------------------------------------------------------------------------------- /oneformer/utils/events.py: -------------------------------------------------------------------------------- 1 | import os 2 | import wandb 3 | from detectron2.utils import comm 4 | from detectron2.utils.events import EventWriter, get_event_storage 5 | 6 | 7 | def setup_wandb(cfg, args): 8 | if comm.is_main_process(): 9 | init_args = { 10 | k.lower(): v 11 | for k, v in cfg.WANDB.items() 12 | if isinstance(k, str) and k not in ["config"] 13 | } 14 | # only include most related part to avoid too big table 15 | # TODO: add configurable params to select which part of `cfg` should be saved in config 16 | if "config_exclude_keys" in init_args: 17 | init_args["config"] = cfg 18 | init_args["config"]["cfg_file"] = args.config_file 19 | else: 20 | init_args["config"] = { 21 | "model": cfg.MODEL, 22 | "solver": cfg.SOLVER, 23 | "cfg_file": args.config_file, 24 | } 25 | if ("name" not in init_args) or (init_args["name"] is None): 26 | init_args["name"] = os.path.basename(args.config_file) 27 | else: 28 | init_args["name"] = init_args["name"] + '_' + os.path.basename(args.config_file) 29 | wandb.init(**init_args) 30 | 31 | 32 | class BaseRule(object): 33 | def __call__(self, target): 34 | return target 35 | 36 | 37 | class IsIn(BaseRule): 38 | def __init__(self, keyword: str): 39 | self.keyword = keyword 40 | 41 | def __call__(self, target): 42 | return self.keyword in target 43 | 44 | 45 | class Prefix(BaseRule): 46 | def __init__(self, keyword: str): 47 | self.keyword = keyword 48 | 49 | def __call__(self, target): 50 | return "/".join([self.keyword, target]) 51 | 52 | 53 | class WandbWriter(EventWriter): 54 | """ 55 | Write all scalars to a tensorboard file. 56 | """ 57 | 58 | def __init__(self): 59 | """ 60 | Args: 61 | log_dir (str): the directory to save the output events 62 | kwargs: other arguments passed to `torch.utils.tensorboard.SummaryWriter(...)` 63 | """ 64 | self._last_write = -1 65 | self._group_rules = [ 66 | (IsIn("/"), BaseRule()), 67 | (IsIn("loss"), Prefix("train")), 68 | ] 69 | 70 | def write(self): 71 | 72 | storage = get_event_storage() 73 | 74 | def _group_name(scalar_name): 75 | for (rule, op) in self._group_rules: 76 | if rule(scalar_name): 77 | return op(scalar_name) 78 | return scalar_name 79 | 80 | stats = { 81 | _group_name(name): scalars[0] 82 | for name, scalars in storage.latest().items() 83 | if scalars[1] > self._last_write 84 | } 85 | if len(stats) > 0: 86 | self._last_write = max([v[1] for k, v in storage.latest().items()]) 87 | 88 | # storage.put_{image,histogram} is only meant to be used by 89 | # tensorboard writer. 
So we access its internal fields directly from here. 90 | if len(storage._vis_data) >= 1: 91 | stats["image"] = [ 92 | wandb.Image(img, caption=img_name) 93 | for img_name, img, step_num in storage._vis_data 94 | ] 95 | # Storage stores all image data and rely on this writer to clear them. 96 | # As a result it assumes only one writer will use its image data. 97 | # An alternative design is to let storage store limited recent 98 | # data (e.g. only the most recent image) that all writers can access. 99 | # In that case a writer may not see all image data if its period is long. 100 | storage.clear_images() 101 | 102 | if len(storage._histograms) >= 1: 103 | 104 | def create_bar(tag, bucket_limits, bucket_counts, **kwargs): 105 | data = [ 106 | [label, val] for (label, val) in zip(bucket_limits, bucket_counts) 107 | ] 108 | table = wandb.Table(data=data, columns=["label", "value"]) 109 | return wandb.plot.bar(table, "label", "value", title=tag) 110 | 111 | stats["hist"] = [create_bar(**params) for params in storage._histograms] 112 | 113 | storage.clear_histograms() 114 | 115 | if len(stats) == 0: 116 | return 117 | wandb.log(stats, step=storage.iter) 118 | 119 | def close(self): 120 | wandb.finish() -------------------------------------------------------------------------------- /oneformer/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py 3 | """ 4 | Misc functions, including distributed helpers. 5 | 6 | Mostly copy-paste from torchvision references. 7 | """ 8 | from typing import List, Optional 9 | 10 | import torch 11 | import torch.distributed as dist 12 | import torchvision 13 | from torch import Tensor 14 | import warnings 15 | import torch.nn.functional as F 16 | import math 17 | 18 | def inverse_sigmoid(x, eps=1e-3): 19 | x = x.clamp(min=0, max=1) 20 | x1 = x.clamp(min=eps) 21 | x2 = (1 - x).clamp(min=eps) 22 | return torch.log(x1/x2) 23 | 24 | def _no_grad_trunc_normal_(tensor, mean, std, a, b): 25 | # Cut & paste from PyTorch official master until it's in a few official releases - RW 26 | # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf 27 | def norm_cdf(x): 28 | # Computes standard normal cumulative distribution function 29 | return (1. + math.erf(x / math.sqrt(2.))) / 2. 30 | 31 | if (mean < a - 2 * std) or (mean > b + 2 * std): 32 | warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " 33 | "The distribution of values may be incorrect.", 34 | stacklevel=2) 35 | 36 | with torch.no_grad(): 37 | # Values are generated by using a truncated uniform distribution and 38 | # then using the inverse CDF for the normal distribution. 39 | # Get upper and lower cdf values 40 | l = norm_cdf((a - mean) / std) 41 | u = norm_cdf((b - mean) / std) 42 | 43 | # Uniformly fill tensor with values from [l, u], then translate to 44 | # [2l-1, 2u-1]. 
45 | tensor.uniform_(2 * l - 1, 2 * u - 1) 46 | 47 | # Use inverse cdf transform for normal distribution to get truncated 48 | # standard normal 49 | tensor.erfinv_() 50 | 51 | # Transform to proper mean, std 52 | tensor.mul_(std * math.sqrt(2.)) 53 | tensor.add_(mean) 54 | 55 | # Clamp to ensure it's in the proper range 56 | tensor.clamp_(min=a, max=b) 57 | return tensor 58 | 59 | def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): 60 | # type: (Tensor, float, float, float, float) -> Tensor 61 | r"""Fills the input Tensor with values drawn from a truncated 62 | normal distribution. The values are effectively drawn from the 63 | normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` 64 | with values outside :math:`[a, b]` redrawn until they are within 65 | the bounds. The method used for generating the random values works 66 | best when :math:`a \leq \text{mean} \leq b`. 67 | Args: 68 | tensor: an n-dimensional `torch.Tensor` 69 | mean: the mean of the normal distribution 70 | std: the standard deviation of the normal distribution 71 | a: the minimum cutoff value 72 | b: the maximum cutoff value 73 | Examples: 74 | >>> w = torch.empty(3, 5) 75 | >>> nn.init.trunc_normal_(w) 76 | """ 77 | return _no_grad_trunc_normal_(tensor, mean, std, a, b) 78 | 79 | def resize(input, 80 | size=None, 81 | scale_factor=None, 82 | mode='nearest', 83 | align_corners=None, 84 | warning=True): 85 | if warning: 86 | if size is not None and align_corners: 87 | input_h, input_w = tuple(int(x) for x in input.shape[2:]) 88 | output_h, output_w = tuple(int(x) for x in size) 89 | if output_h > input_h or output_w > output_h: 90 | if ((output_h > 1 and output_w > 1 and input_h > 1 91 | and input_w > 1) and (output_h - 1) % (input_h - 1) 92 | and (output_w - 1) % (input_w - 1)): 93 | warnings.warn( 94 | f'When align_corners={align_corners}, ' 95 | 'the output would more aligned if ' 96 | f'input size {(input_h, input_w)} is `x+1` and ' 97 | f'out size {(output_h, output_w)} is `nx+1`') 98 | if isinstance(size, torch.Size): 99 | size = tuple(int(x) for x in size) 100 | return F.interpolate(input, size, scale_factor, mode, align_corners) 101 | 102 | def _max_by_axis(the_list): 103 | # type: (List[List[int]]) -> List[int] 104 | maxes = the_list[0] 105 | for sublist in the_list[1:]: 106 | for index, item in enumerate(sublist): 107 | maxes[index] = max(maxes[index], item) 108 | return maxes 109 | 110 | 111 | class NestedTensor(object): 112 | def __init__(self, tensors, mask: Optional[Tensor]): 113 | self.tensors = tensors 114 | self.mask = mask 115 | 116 | def to(self, device): 117 | # type: (Device) -> NestedTensor # noqa 118 | cast_tensor = self.tensors.to(device) 119 | mask = self.mask 120 | if mask is not None: 121 | assert mask is not None 122 | cast_mask = mask.to(device) 123 | else: 124 | cast_mask = None 125 | return NestedTensor(cast_tensor, cast_mask) 126 | 127 | def decompose(self): 128 | return self.tensors, self.mask 129 | 130 | def __repr__(self): 131 | return str(self.tensors) 132 | 133 | 134 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 135 | # TODO make this more general 136 | if tensor_list[0].ndim == 3: 137 | if torchvision._is_tracing(): 138 | # nested_tensor_from_tensor_list() does not export well to ONNX 139 | # call _onnx_nested_tensor_from_tensor_list() instead 140 | return _onnx_nested_tensor_from_tensor_list(tensor_list) 141 | 142 | # TODO make it support different-sized images 143 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 144 | # 
min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 145 | batch_shape = [len(tensor_list)] + max_size 146 | b, c, h, w = batch_shape 147 | dtype = tensor_list[0].dtype 148 | device = tensor_list[0].device 149 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 150 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 151 | for img, pad_img, m in zip(tensor_list, tensor, mask): 152 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 153 | m[: img.shape[1], : img.shape[2]] = False 154 | else: 155 | raise ValueError("not supported") 156 | return NestedTensor(tensor, mask) 157 | 158 | 159 | # _onnx_nested_tensor_from_tensor_list() is an implementation of 160 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing. 161 | @torch.jit.unused 162 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: 163 | max_size = [] 164 | for i in range(tensor_list[0].dim()): 165 | max_size_i = torch.max( 166 | torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) 167 | ).to(torch.int64) 168 | max_size.append(max_size_i) 169 | max_size = tuple(max_size) 170 | 171 | # work around for 172 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 173 | # m[: img.shape[1], :img.shape[2]] = False 174 | # which is not yet supported in onnx 175 | padded_imgs = [] 176 | padded_masks = [] 177 | for img in tensor_list: 178 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 179 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 180 | padded_imgs.append(padded_img) 181 | 182 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) 183 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) 184 | padded_masks.append(padded_mask.to(torch.bool)) 185 | 186 | tensor = torch.stack(padded_imgs) 187 | mask = torch.stack(padded_masks) 188 | 189 | return NestedTensor(tensor, mask=mask) 190 | 191 | 192 | def is_dist_avail_and_initialized(): 193 | if not dist.is_available(): 194 | return False 195 | if not dist.is_initialized(): 196 | return False 197 | return True 198 | -------------------------------------------------------------------------------- /oneformer/utils/pos_embed.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Position embedding utils 3 | # -------------------------------------------------------- 4 | 5 | from typing import Tuple 6 | 7 | import numpy as np 8 | import torch 9 | 10 | 11 | # -------------------------------------------------------- 12 | # 2D sine-cosine position embedding 13 | # References: 14 | # Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py 15 | # MoCo v3: https://github.com/facebookresearch/moco-v3 16 | # -------------------------------------------------------- 17 | def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): 18 | """ 19 | grid_size: int of the grid height and width 20 | return: 21 | pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) 22 | """ 23 | grid_h = np.arange(grid_size, dtype=np.float32) 24 | grid_w = np.arange(grid_size, dtype=np.float32) 25 | grid = np.meshgrid(grid_w, grid_h) # here w goes first 26 | grid = np.stack(grid, axis=0) 27 | 28 | grid = grid.reshape([2, 1, grid_size, grid_size]) 29 | pos_embed = 
get_2d_sincos_pos_embed_from_grid(embed_dim, grid) 30 | if cls_token: 31 | pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) 32 | return pos_embed 33 | 34 | 35 | def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): 36 | assert embed_dim % 2 == 0 37 | 38 | # use half of dimensions to encode grid_h 39 | emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) 40 | emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) 41 | 42 | emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) 43 | return emb 44 | 45 | 46 | def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): 47 | """ 48 | embed_dim: output dimension for each position 49 | pos: a list of positions to be encoded: size (M,) 50 | out: (M, D) 51 | """ 52 | assert embed_dim % 2 == 0 53 | omega = np.arange(embed_dim // 2, dtype=np.float) 54 | omega /= embed_dim / 2.0 55 | omega = 1.0 / 10000 ** omega # (D/2,) 56 | 57 | pos = pos.reshape(-1) # (M,) 58 | out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product 59 | 60 | emb_sin = np.sin(out) # (M, D/2) 61 | emb_cos = np.cos(out) # (M, D/2) 62 | 63 | emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) 64 | return emb 65 | 66 | 67 | # -------------------------------------------------------- 68 | # Interpolate position embeddings for high-resolution 69 | # References: 70 | # DeiT: https://github.com/facebookresearch/deit 71 | # -------------------------------------------------------- 72 | def interpolate_pos_embed(model, checkpoint_model, pos_embed_key): 73 | if pos_embed_key in checkpoint_model: 74 | pos_embed_checkpoint = checkpoint_model[pos_embed_key] 75 | embedding_size = pos_embed_checkpoint.shape[-1] 76 | num_patches = model.num_patches 77 | if pos_embed_key.startswith("decoder"): 78 | num_extra_tokens = model.decoder_pos_embed.shape[-2] - num_patches 79 | else: 80 | num_extra_tokens = model.pos_embed.shape[-2] - num_patches 81 | # height (== width) for the checkpoint position embedding 82 | orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) 83 | # height (== width) for the new position embedding 84 | new_size = int(num_patches ** 0.5) 85 | # class_token and dist_token are kept unchanged 86 | if orig_size != new_size: 87 | print( 88 | "Position interpolate from %dx%d to %dx%d" 89 | % (orig_size, orig_size, new_size, new_size) 90 | ) 91 | extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] 92 | # only the position tokens are interpolated 93 | pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] 94 | pos_tokens = pos_tokens.reshape( 95 | -1, orig_size, orig_size, embedding_size 96 | ).permute(0, 3, 1, 2) 97 | pos_tokens = torch.nn.functional.interpolate( 98 | pos_tokens, 99 | size=(new_size, new_size), 100 | mode="bicubic", 101 | align_corners=False, 102 | ) 103 | pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) 104 | new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) 105 | checkpoint_model[pos_embed_key] = new_pos_embed 106 | 107 | 108 | def interpolate_pos_embed_online( 109 | pos_embed, orig_size: Tuple[int], new_size: Tuple[int], num_extra_tokens: int 110 | ): 111 | extra_tokens = pos_embed[:, :num_extra_tokens] 112 | pos_tokens = pos_embed[:, num_extra_tokens:] 113 | embedding_size = pos_tokens.shape[-1] 114 | pos_tokens = pos_tokens.reshape( 115 | -1, orig_size[0], orig_size[1], embedding_size 116 | ).permute(0, 3, 1, 2) 117 | pos_tokens = torch.nn.functional.interpolate( 118 | pos_tokens, size=new_size, mode="bicubic", align_corners=False, 119 
| ) 120 | pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) 121 | new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) 122 | return new_pos_embed 123 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | scipy==1.8.1 3 | shapely 4 | h5py==3.7.0 5 | submitit==1.4.2 6 | scikit-image 7 | timm==0.4.12 8 | einops==0.4.1 9 | icecream==2.1.2 10 | setuptools==59.5.0 11 | wandb==0.12.20 12 | ftfy==6.1.1 13 | regex==2022.6.2 14 | inflect==5.6.0 15 | diffdist==0.1 16 | pytorch_lightning==1.6.4 17 | tqdm==4.64.0 18 | mmcv==1.6.2 19 | -f https://shi-labs.com/natten/wheels/cu113/torch1.10.1/index.html 20 | natten==0.14.4 -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | # OneFormer Tools 2 | 3 | ## Download Pretrained Weights 4 | 5 | It's common to initialize from backbone models pre-trained on ImageNet classification tasks. We use [Swin-Transformer](https://github.com/microsoft/Swin-Transformer), [ConvNeXt](https://github.com/facebookresearch/ConvNeXt), and [DiNAT](https://github.com/SHI-Labs/Neighborhood-Attention-Transformer) for our experiments. 6 | 7 |
8 | ### Swin-Transformer 9 | 10 | - [Official Repo](https://github.com/microsoft/Swin-Transformer) 11 | - `convert-pretrained-model-to-d2.py`: Tool to convert Swin Transformer pre-trained weights for D2 (a condensed sketch of the conversion follows this block). 12 | 13 | ```bash 14 | pip install timm 15 | 16 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth 17 | python tools/convert-pretrained-model-to-d2.py swin_large_patch4_window12_384_22k.pth swin_large_patch4_window12_384_22k.pkl 18 | 19 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22kto1k.pth 20 | python tools/convert-pretrained-model-to-d2.py swin_large_patch4_window12_384_22kto1k.pth swin_large_patch4_window12_384_22kto1k.pkl 21 | ``` 22 | 23 |
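For reference, the Swin conversion above boils down to re-serializing the checkpoint's `model` state dict into a Detectron2-style pickle. A condensed sketch of what `tools/convert-pretrained-model-to-d2.py` (reproduced in full later in this listing) does, using a filename from the commands above:

```python
import pickle as pkl

import torch

# same logic as tools/convert-pretrained-model-to-d2.py, inlined for illustration
src = "swin_large_patch4_window12_384_22k.pth"
dst = "swin_large_patch4_window12_384_22k.pkl"

# Swin checkpoints store the weights under the "model" key
state_dict = torch.load(src, map_location="cpu")["model"]

# "matching_heuristics" lets Detectron2 remap backbone keys when the weights are loaded
res = {"model": state_dict, "__author__": "third_party", "matching_heuristics": True}
with open(dst, "wb") as f:
    pkl.dump(res, f)
```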
24 | 25 |
26 | ### ConvNeXt 27 | 28 | - [Official Repo](https://github.com/facebookresearch/ConvNeXt) 29 | - `convert-pretrained-model-to-d2.py`: Tool to convert ConvNeXt pre-trained weights for D2. 30 | 31 | ```bash 32 | wget https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_384.pth 33 | python tools/convert-pretrained-model-to-d2.py convnext_large_22k_1k_384.pth convnext_large_22k_1k_384.pkl 34 | 35 | wget https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_384_ema.pth 36 | python tools/convert-pretrained-model-to-d2.py convnext_xlarge_22k_1k_384_ema.pth convnext_xlarge_22k_1k_384_ema.pkl 37 | ``` 38 | 39 |
40 | 41 |
42 | ### DiNAT 43 | 44 | - [Official Repo](https://github.com/SHI-Labs/Neighborhood-Attention-Transformer) 45 | - `convert-pretrained-nat-model-to-d2.py`: Tool to convert DiNAT pre-trained weights for D2. 46 | 47 | ```bash 48 | wget https://shi-labs.com/projects/dinat/checkpoints/imagenet1k/dinat_large_in22k_in1k_384_11x11.pth 49 | python tools/convert-pretrained-nat-model-to-d2.py dinat_large_in22k_in1k_384_11x11.pth dinat_large_in22k_in1k_384_11x11.pkl 50 | 51 | wget https://shi-labs.com/projects/dinat/checkpoints/imagenet22k/dinat_large_in22k_224.pth 52 | python tools/convert-pretrained-nat-model-to-d2.py dinat_large_in22k_224.pth dinat_large_in22k_224.pkl 53 | ``` 54 | 55 |
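Each converter in this section writes a Detectron2-style pickle with `model`, `__author__`, and `matching_heuristics` keys. Before pointing `MODEL.WEIGHTS` at a converted file, a minimal sanity check can be run; the path below is only an example:

```python
import pickle

# hypothetical path: any .pkl produced by the conversion commands above works here
ckpt_path = "dinat_large_in22k_in1k_384_11x11.pkl"

with open(ckpt_path, "rb") as f:
    ckpt = pickle.load(f)

# the converters always set matching_heuristics so Detectron2 remaps backbone keys on load
assert ckpt["matching_heuristics"] is True
print(f'{len(ckpt["model"])} entries, converted by "{ckpt["__author__"]}"')
```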
56 | 57 | ## Analyze Model 58 | 59 | - Tool to analyze model parameters, FLOPs, and speed. 60 | - We use a dummy image to compute FLOPs on ADE20K and Cityscapes. 61 | - For COCO, we use 100 random validation images. 62 | - We set `task = panoptic` by default. 63 | 64 | ```bash 65 | python tools/analyze_model.py --num-inputs 100 --tasks [flop speed] \ 66 | --config-file configs/ade20k/swin/oneformer_swin_large_IN21k_384_bs16_160k.yaml \ 67 | MODEL.WEIGHTS <path-to-checkpoint> [--use-fixed-input-size] MODEL.TEST.SEMANTIC_ON False MODEL.TEST.INSTANCE_ON False 68 | ``` 69 | 70 | ## Training Throughput 71 | 72 | - Tool to compute throughput. 73 | - We compute throughput for 500 iterations by default. 74 | 75 | ```bash 76 | python tools/calc_throughput.py --dist-url 'tcp://127.0.0.1:50162' \ 77 | --num-gpus 8 \ 78 | --config-file configs/ade20k/swin/oneformer_swin_large_IN21k_384_bs16_160k.yaml \ 79 | MODEL.WEIGHTS pretrain/swin_large_patch4_window12_384_22kto1k.pkl \ 80 | OUTPUT_DIR tp_out SOLVER.MAX_ITER 500 81 | 82 | rm -rf tp_out 83 | ``` 84 | -------------------------------------------------------------------------------- /tools/analyze_model.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy as np 3 | from collections import Counter 4 | import tqdm 5 | from fvcore.nn import flop_count_table # can also try flop_count_str 6 | 7 | from detectron2.checkpoint import DetectionCheckpointer 8 | from detectron2.config import CfgNode, LazyConfig, get_cfg, instantiate 9 | from detectron2.engine import default_argument_parser 10 | from detectron2.modeling import build_model 11 | from detectron2.projects.deeplab import add_deeplab_config 12 | from detectron2.utils.analysis import ( 13 | FlopCountAnalysis, 14 | activation_count_operators, 15 | parameter_count_table, 16 | ) 17 | from detectron2.utils.logger import setup_logger 18 | 19 | # fmt: off 20 | import os 21 | import sys 22 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 23 | # fmt: on 24 | 25 | from oneformer.data.build import * 26 | from oneformer.data.dataset_mappers.dataset_mapper import DatasetMapper 27 | from oneformer import ( 28 | add_oneformer_config, 29 | add_common_config, 30 | add_swin_config, 31 | add_dinat_config, 32 | add_beit_adapter_config, 33 | add_convnext_config, 34 | ) 35 | 36 | logger = logging.getLogger("detectron2") 37 | 38 | 39 | def setup(args): 40 | if args.config_file.endswith(".yaml"): 41 | cfg = get_cfg() 42 | add_deeplab_config(cfg) 43 | add_common_config(cfg) 44 | add_swin_config(cfg) 45 | add_dinat_config(cfg) 46 | add_beit_adapter_config(cfg) 47 | add_oneformer_config(cfg) 48 | add_convnext_config(cfg) 49 | cfg.merge_from_file(args.config_file) 50 | cfg.DATALOADER.NUM_WORKERS = 0 51 | cfg.merge_from_list(args.opts) 52 | cfg.freeze() 53 | else: 54 | cfg = LazyConfig.load(args.config_file) 55 | cfg = LazyConfig.apply_overrides(cfg, args.opts) 56 | setup_logger(name="fvcore") 57 | setup_logger() 58 | return cfg 59 | 60 | 61 | def do_flop(cfg): 62 | if isinstance(cfg, CfgNode): 63 | mapper = DatasetMapper(cfg, False) 64 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST_PANOPTIC[0], mapper=mapper) 65 | model = build_model(cfg) 66 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 67 | else: 68 | data_loader = instantiate(cfg.dataloader.test) 69 | model = instantiate(cfg.model) 70 | model.to(cfg.train.device) 71 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint) 72 | model.eval() 73 | 74 | counts = Counter() 75 | total_flops = [] 76 |
for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa 77 | if args.use_fixed_input_size and isinstance(cfg, CfgNode): 78 | import torch 79 | crop_size = cfg.INPUT.CROP.SIZE 80 | data[0]["image"] = torch.zeros((3, crop_size[0], crop_size[1])) 81 | flops = FlopCountAnalysis(model, data) 82 | if idx > 0: 83 | flops.unsupported_ops_warnings(False).uncalled_modules_warnings(False) 84 | counts += flops.by_operator() 85 | total_flops.append(flops.total()) 86 | 87 | logger.info("Flops table computed from only one input sample:\n" + flop_count_table(flops)) 88 | logger.info( 89 | "Average GFlops for each type of operators:\n" 90 | + str([(k, v / (idx + 1) / 1e9) for k, v in counts.items()]) 91 | ) 92 | logger.info( 93 | "Total GFlops: {:.1f}±{:.1f}".format(np.mean(total_flops) / 1e9, np.std(total_flops) / 1e9) 94 | ) 95 | 96 | 97 | def do_activation(cfg): 98 | if isinstance(cfg, CfgNode): 99 | mapper = DatasetMapper(cfg, False) 100 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST_PANOPTIC[0], mapper=mapper) 101 | model = build_model(cfg) 102 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 103 | else: 104 | data_loader = instantiate(cfg.dataloader.test) 105 | model = instantiate(cfg.model) 106 | model.to(cfg.train.device) 107 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint) 108 | model.eval() 109 | 110 | counts = Counter() 111 | total_activations = [] 112 | for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa 113 | count = activation_count_operators(model, data) 114 | counts += count 115 | total_activations.append(sum(count.values())) 116 | logger.info( 117 | "(Million) Activations for Each Type of Operators:\n" 118 | + str([(k, v / idx) for k, v in counts.items()]) 119 | ) 120 | logger.info( 121 | "Total (Million) Activations: {}±{}".format( 122 | np.mean(total_activations), np.std(total_activations) 123 | ) 124 | ) 125 | 126 | def do_speed(cfg): 127 | if isinstance(cfg, CfgNode): 128 | model = build_model(cfg) 129 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 130 | else: 131 | model = instantiate(cfg.model) 132 | model.to(cfg.train.device) 133 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint) 134 | model.eval() 135 | import torch 136 | crop_size = cfg.INPUT.CROP.SIZE 137 | data = [{}] 138 | data[0]["image"] = torch.zeros((3, crop_size[0], crop_size[1])) 139 | data[0]["task"] = "the task is panoptic" 140 | total_times = [] 141 | for _ in tqdm.trange(100): # noqa 142 | model(data) 143 | torch.cuda.synchronize() 144 | tstart = torch.cuda.Event(enable_timing=True) 145 | tend = torch.cuda.Event(enable_timing=True) 146 | fps = [] 147 | times = [] 148 | for _ in range(5): 149 | for _ in tqdm.trange(args.num_inputs): # noqa 150 | tstart.record() 151 | model(data) 152 | tend.record() 153 | torch.cuda.synchronize() 154 | total_times.append(tstart.elapsed_time(tend)) 155 | times.append(np.mean(total_times)) 156 | fps.append(1000/np.mean(total_times)) 157 | 158 | logger.info( 159 | "Average Time per {}x{} Image : {:.1f} ± {:.1f} milli-seconds".format(crop_size, crop_size, np.mean(times), np.std(times)) 160 | ) 161 | logger.info( 162 | "FPS : {:.2f} ± {:.2f}".format(np.mean(fps), np.std(fps)) 163 | ) 164 | 165 | def do_parameter(cfg): 166 | if isinstance(cfg, CfgNode): 167 | model = build_model(cfg) 168 | else: 169 | model = instantiate(cfg.model) 170 | logger.info("Parameter Count:\n" + parameter_count_table(model, max_depth=5)) 171 | 172 | 173 | def do_structure(cfg): 174 | if isinstance(cfg, CfgNode): 175 | 
model = build_model(cfg) 176 | else: 177 | model = instantiate(cfg.model) 178 | logger.info("Model Structure:\n" + str(model)) 179 | 180 | 181 | if __name__ == "__main__": 182 | parser = default_argument_parser( 183 | epilog=""" 184 | Examples: 185 | To show parameters of a model: 186 | $ ./analyze_model.py --tasks parameter \\ 187 | --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml 188 | Flops and activations are data-dependent, therefore inputs and model weights 189 | are needed to count them: 190 | $ ./analyze_model.py --num-inputs 100 --tasks flop \\ 191 | --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \\ 192 | MODEL.WEIGHTS /path/to/model.pkl 193 | """ 194 | ) 195 | parser.add_argument( 196 | "--tasks", 197 | choices=["flop", "speed", "activation", "parameter", "structure"], 198 | required=True, 199 | nargs="+", 200 | ) 201 | parser.add_argument( 202 | "-n", 203 | "--num-inputs", 204 | default=100, 205 | type=int, 206 | help="number of inputs used to compute statistics for flops/activations, " 207 | "both are data dependent.", 208 | ) 209 | parser.add_argument( 210 | "--use-fixed-input-size", 211 | action="store_true", 212 | help="use fixed input size when calculating flops", 213 | ) 214 | args = parser.parse_args() 215 | assert not args.eval_only 216 | assert args.num_gpus == 1 217 | 218 | cfg = setup(args) 219 | 220 | for task in args.tasks: 221 | { 222 | "flop": do_flop, 223 | "speed": do_speed, 224 | "activation": do_activation, 225 | "parameter": do_parameter, 226 | "structure": do_structure, 227 | }[task](cfg) 228 | -------------------------------------------------------------------------------- /tools/convert-pretrained-model-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download pretrained swin model: 12 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 13 | # run the conversion 14 | ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 15 | # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl" 18 | INPUT: 19 | FORMAT: "RGB" 20 | """ 21 | 22 | if __name__ == "__main__": 23 | input = sys.argv[1] 24 | 25 | obj = torch.load(input, map_location="cpu")["model"] 26 | 27 | res = {"model": obj, "__author__": "third_party", "matching_heuristics": True} 28 | 29 | with open(sys.argv[2], "wb") as f: 30 | pkl.dump(res, f) 31 | -------------------------------------------------------------------------------- /tools/convert-pretrained-nat-model-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download a pretrained DiNAT model: 12 | wget https://shi-labs.com/projects/dinat/checkpoints/imagenet22k/dinat_large_in22k_224.pth 13 | # run the conversion 14 | ./convert-pretrained-nat-model-to-d2.py dinat_large_in22k_224.pth dinat_large_in22k_224.pkl 15 | # Then, use dinat_large_in22k_224.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/dinat_large_in22k_224.pkl" 18 | INPUT: 19 | FORMAT: "RGB" 20 | """ 21 | 22 | if __name__ == "__main__": 23 | input = sys.argv[1] 24 | 25 | obj = torch.load(input, map_location="cpu") 26 | 27 | res = {"model": obj, "__author__": "third_party", "matching_heuristics": True} 28 | 29 | with open(sys.argv[2], "wb") as f: 30 | pkl.dump(res, f) 31 | -------------------------------------------------------------------------------- /tools/convert-torchvision-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download one of the ResNet{18,34,50,101,152} models from torchvision: 12 | wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth 13 | # run the conversion 14 | ./convert-torchvision-to-d2.py r50.pth r50.pkl 15 | # Then, use r50.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/r50.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | RESNETS: 21 | DEPTH: 50 22 | STRIDE_IN_1X1: False 23 | INPUT: 24 | FORMAT: "RGB" 25 | """ 26 | 27 | if __name__ == "__main__": 28 | input = sys.argv[1] 29 | 30 | obj = torch.load(input, map_location="cpu") 31 | 32 | newmodel = {} 33 | for k in list(obj.keys()): 34 | old_k = k 35 | if "layer" not in k: 36 | k = "stem." + k 37 | for t in [1, 2, 3, 4]: 38 | k = k.replace("layer{}".format(t), "res{}".format(t + 1)) 39 | for t in [1, 2, 3]: 40 | k = k.replace("bn{}".format(t), "conv{}.norm".format(t)) 41 | k = k.replace("downsample.0", "shortcut") 42 | k = k.replace("downsample.1", "shortcut.norm") 43 | print(old_k, "->", k) 44 | newmodel[k] = obj.pop(old_k).detach().numpy() 45 | 46 | res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True} 47 | 48 | with open(sys.argv[2], "wb") as f: 49 | pkl.dump(res, f) 50 | if obj: 51 | print("Unconverted keys:", obj.keys()) 52 | -------------------------------------------------------------------------------- /tools/setup_detectron2.py: -------------------------------------------------------------------------------- 1 | import sys, os, distutils.core, subprocess 2 | 3 | if not os.path.exists('./detectron2'): 4 | subprocess.run(['git', 'clone', 'https://github.com/facebookresearch/detectron2']) 5 | 6 | dist = distutils.core.run_setup("./detectron2/setup.py") 7 | 8 | for x in dist.install_requires: 9 | subprocess.run(['python', '-m', 'pip', 'install', x]) 10 | 11 | sys.path.insert(0, os.path.abspath('./detectron2')) --------------------------------------------------------------------------------
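To make the renaming performed by `tools/convert-torchvision-to-d2.py` above concrete, here is a small self-contained sketch that applies the same key mapping to a few representative torchvision ResNet keys (the sample keys are illustrative, not exhaustive):

```python
# reproduces the key-renaming loop from tools/convert-torchvision-to-d2.py
def remap(k: str) -> str:
    if "layer" not in k:
        k = "stem." + k
    for t in [1, 2, 3, 4]:
        k = k.replace("layer{}".format(t), "res{}".format(t + 1))
    for t in [1, 2, 3]:
        k = k.replace("bn{}".format(t), "conv{}.norm".format(t))
    k = k.replace("downsample.0", "shortcut")
    k = k.replace("downsample.1", "shortcut.norm")
    return k


for key in ["conv1.weight", "bn1.weight", "layer1.0.bn1.weight", "layer4.2.downsample.0.weight"]:
    print(key, "->", remap(key))
# conv1.weight -> stem.conv1.weight
# bn1.weight -> stem.conv1.norm.weight
# layer1.0.bn1.weight -> res2.0.conv1.norm.weight
# layer4.2.downsample.0.weight -> res5.2.shortcut.weight
```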