├── .gitignore ├── GETTING_STARTED.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── configs ├── Panoptic │ ├── odise_caption_coco_50e.py │ └── odise_label_coco_50e.py └── common │ ├── data │ ├── coco_panoptic_semseg.py │ └── pano_open_d2_eval.py │ ├── models │ ├── mask_generator_with_caption.py │ ├── mask_generator_with_label.py │ ├── odise_with_caption.py │ └── odise_with_label.py │ ├── optim.py │ ├── schedule.py │ └── train.py ├── datasets ├── README.md ├── ade20k_instance_catid_mapping.txt ├── ade20k_instance_imgCatIds.json ├── prepare_ade20k_full_sem_seg.py ├── prepare_ade20k_ins_seg.py ├── prepare_ade20k_pan_seg.py ├── prepare_ade20k_sem_seg.py ├── prepare_coco_caption.py ├── prepare_coco_semantic_annos_from_panoptic_annos.py ├── prepare_lvis_openseg_labels.py ├── prepare_pascal_ctx_full_sem_seg.py ├── prepare_pascal_ctx_sem_seg.py └── prepare_pascal_voc_sem_seg.py ├── demo ├── app.py ├── demo.ipynb ├── demo.py └── examples │ ├── ade.jpg │ ├── coco.jpg │ ├── ego4d.jpg │ └── purse.jpeg ├── docker └── Dockerfile ├── figs ├── github_arch.gif ├── github_vis_ade_0.gif ├── github_vis_ade_1.gif ├── github_vis_coco_0.gif ├── github_vis_coco_1.gif ├── github_vis_ego4d_0.gif ├── github_vis_ego4d_1.gif └── teaser.jpg ├── odise ├── __init__.py ├── checkpoint │ ├── __init__.py │ └── odise_checkpointer.py ├── config │ ├── __init__.py │ ├── instantiate.py │ └── utils.py ├── data │ ├── __init__.py │ ├── build.py │ ├── dataset_mapper.py │ └── datasets │ │ ├── __init__.py │ │ ├── openseg_labels │ │ ├── README.md │ │ ├── ade20k_150.txt │ │ ├── ade20k_150_with_prompt_eng.txt │ │ ├── ade20k_847.txt │ │ ├── ade20k_847_with_prompt_eng.txt │ │ ├── coco_panoptic.txt │ │ ├── coco_panoptic_with_prompt_eng.txt │ │ ├── lvis_1203.txt │ │ ├── lvis_1203_with_prompt_eng.txt │ │ ├── pascal_context_459.txt │ │ ├── pascal_context_459_with_prompt_eng.txt │ │ ├── pascal_context_59.txt │ │ ├── pascal_context_59_with_prompt_eng.txt │ │ ├── pascal_voc_21.txt │ │ └── pascal_voc_21_with_prompt_eng.txt │ │ ├── register_coco_caption.py │ │ └── register_pascal.py ├── engine │ ├── __init__.py │ ├── defaults.py │ ├── hooks.py │ └── train_loop.py ├── evaluation │ ├── __init__.py │ ├── d2_evaluator.py │ └── evaluator.py ├── model_zoo │ ├── __init__.py │ └── model_zoo.py ├── modeling │ ├── __init__.py │ ├── backbone │ │ ├── __init__.py │ │ └── feature_extractor.py │ ├── diffusion │ │ ├── __init__.py │ │ ├── diffusion_builder.py │ │ ├── gaussian_diffusion.py │ │ ├── resample.py │ │ └── respace.py │ ├── meta_arch │ │ ├── __init__.py │ │ ├── clip.py │ │ ├── helper.py │ │ ├── ldm.py │ │ └── odise.py │ ├── preprocess.py │ └── wrapper │ │ ├── __init__.py │ │ └── pano_wrapper.py └── utils │ ├── __init__.py │ ├── collect_env.py │ ├── events.py │ ├── file_io.py │ └── parameter_count.py ├── setup.cfg ├── setup.py ├── third_party └── Mask2Former │ ├── .gitignore │ ├── ADVANCED_USAGE.md │ ├── CODE_OF_CONDUCT.md │ ├── CONTRIBUTING.md │ ├── GETTING_STARTED.md │ ├── INSTALL.md │ ├── LICENSE │ ├── MODEL_ZOO.md │ ├── README.md │ ├── cog.yaml │ ├── configs │ ├── ade20k │ │ ├── instance-segmentation │ │ │ ├── Base-ADE20K-InstanceSegmentation.yaml │ │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ │ └── swin │ │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_160k.yaml │ │ ├── panoptic-segmentation │ │ │ ├── Base-ADE20K-PanopticSegmentation.yaml │ │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ │ └── swin │ │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_160k.yaml │ │ └── semantic-segmentation │ │ │ ├── Base-ADE20K-SemanticSegmentation.yaml │ │ │ ├── 
maskformer2_R101_bs16_90k.yaml │ │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ │ └── swin │ │ │ ├── maskformer2_swin_base_384_bs16_160k_res640.yaml │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_160k_res640.yaml │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_160k_res640.yaml │ │ │ ├── maskformer2_swin_small_bs16_160k.yaml │ │ │ └── maskformer2_swin_tiny_bs16_160k.yaml │ ├── cityscapes │ │ ├── instance-segmentation │ │ │ ├── Base-Cityscapes-InstanceSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ │ └── swin │ │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ │ └── maskformer2_swin_tiny_bs16_90k.yaml │ │ ├── panoptic-segmentation │ │ │ ├── Base-Cityscapes-PanopticSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ │ └── swin │ │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ │ └── maskformer2_swin_tiny_bs16_90k.yaml │ │ └── semantic-segmentation │ │ │ ├── Base-Cityscapes-SemanticSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ │ └── swin │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ └── maskformer2_swin_tiny_bs16_90k.yaml │ ├── coco │ │ ├── instance-segmentation │ │ │ ├── Base-COCO-InstanceSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_50ep.yaml │ │ │ ├── maskformer2_R50_bs16_50ep.yaml │ │ │ └── swin │ │ │ │ ├── maskformer2_swin_base_384_bs16_50ep.yaml │ │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_50ep.yaml │ │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_100ep.yaml │ │ │ │ ├── maskformer2_swin_small_bs16_50ep.yaml │ │ │ │ └── maskformer2_swin_tiny_bs16_50ep.yaml │ │ └── panoptic-segmentation │ │ │ ├── Base-COCO-PanopticSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_50ep.yaml │ │ │ ├── maskformer2_R50_bs16_50ep.yaml │ │ │ └── swin │ │ │ ├── maskformer2_swin_base_384_bs16_50ep.yaml │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_50ep.yaml │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_100ep.yaml │ │ │ ├── maskformer2_swin_small_bs16_50ep.yaml │ │ │ └── maskformer2_swin_tiny_bs16_50ep.yaml │ ├── mapillary-vistas │ │ ├── panoptic-segmentation │ │ │ ├── Base-MapillaryVistas-PanopticSegmentation.yaml │ │ │ ├── maskformer_R50_bs16_300k.yaml │ │ │ └── swin │ │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_300k.yaml │ │ └── semantic-segmentation │ │ │ ├── Base-MapillaryVistas-SemanticSegmentation.yaml │ │ │ ├── maskformer2_R50_bs16_300k.yaml │ │ │ └── swin │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_300k.yaml │ ├── youtubevis_2019 │ │ ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml │ │ ├── swin │ │ │ ├── video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml │ │ │ ├── video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml │ │ │ ├── video_maskformer2_swin_small_bs16_8ep.yaml │ │ │ └── video_maskformer2_swin_tiny_bs16_8ep.yaml │ │ ├── video_maskformer2_R101_bs16_8ep.yaml │ │ └── video_maskformer2_R50_bs16_8ep.yaml │ └── youtubevis_2021 │ │ ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml │ │ ├── swin │ │ ├── video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml │ │ ├── video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml │ │ ├── video_maskformer2_swin_small_bs16_8ep.yaml │ │ └── 
video_maskformer2_swin_tiny_bs16_8ep.yaml │ │ ├── video_maskformer2_R101_bs16_8ep.yaml │ │ └── video_maskformer2_R50_bs16_8ep.yaml │ ├── datasets │ ├── README.md │ ├── ade20k_instance_catid_mapping.txt │ ├── ade20k_instance_imgCatIds.json │ ├── prepare_ade20k_ins_seg.py │ ├── prepare_ade20k_pan_seg.py │ ├── prepare_ade20k_sem_seg.py │ └── prepare_coco_semantic_annos_from_panoptic_annos.py │ ├── demo │ ├── README.md │ ├── demo.py │ └── predictor.py │ ├── demo_video │ ├── README.md │ ├── demo.py │ ├── predictor.py │ └── visualizer.py │ ├── mask2former │ ├── __init__.py │ ├── config.py │ ├── data │ │ ├── __init__.py │ │ ├── dataset_mappers │ │ │ ├── __init__.py │ │ │ ├── coco_instance_new_baseline_dataset_mapper.py │ │ │ ├── coco_panoptic_new_baseline_dataset_mapper.py │ │ │ ├── mask_former_instance_dataset_mapper.py │ │ │ ├── mask_former_panoptic_dataset_mapper.py │ │ │ └── mask_former_semantic_dataset_mapper.py │ │ └── datasets │ │ │ ├── __init__.py │ │ │ ├── register_ade20k_full.py │ │ │ ├── register_ade20k_instance.py │ │ │ ├── register_ade20k_panoptic.py │ │ │ ├── register_coco_panoptic_annos_semseg.py │ │ │ ├── register_coco_stuff_10k.py │ │ │ ├── register_mapillary_vistas.py │ │ │ └── register_mapillary_vistas_panoptic.py │ ├── evaluation │ │ ├── __init__.py │ │ └── instance_evaluation.py │ ├── maskformer_model.py │ ├── modeling │ │ ├── __init__.py │ │ ├── backbone │ │ │ ├── __init__.py │ │ │ └── swin.py │ │ ├── criterion.py │ │ ├── matcher.py │ │ ├── meta_arch │ │ │ ├── __init__.py │ │ │ ├── mask_former_head.py │ │ │ └── per_pixel_baseline.py │ │ ├── pixel_decoder │ │ │ ├── __init__.py │ │ │ ├── fpn.py │ │ │ ├── msdeformattn.py │ │ │ └── ops │ │ │ │ ├── __init__.py │ │ │ │ ├── functions │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn_func.py │ │ │ │ ├── modules │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn.py │ │ │ │ ├── src │ │ │ │ ├── cpu │ │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ │ └── ms_deform_attn_cpu.h │ │ │ │ ├── cuda │ │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ │ ├── ms_deform_attn.h │ │ │ │ └── vision.cpp │ │ │ │ └── test.py │ │ └── transformer_decoder │ │ │ ├── __init__.py │ │ │ ├── mask2former_transformer_decoder.py │ │ │ ├── maskformer_transformer_decoder.py │ │ │ ├── position_encoding.py │ │ │ └── transformer.py │ ├── test_time_augmentation.py │ └── utils │ │ ├── __init__.py │ │ └── misc.py │ ├── mask2former_video │ ├── __init__.py │ ├── config.py │ ├── data_video │ │ ├── __init__.py │ │ ├── augmentation.py │ │ ├── build.py │ │ ├── dataset_mapper.py │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ ├── builtin.py │ │ │ ├── ytvis.py │ │ │ └── ytvis_api │ │ │ │ ├── __init__.py │ │ │ │ ├── ytvos.py │ │ │ │ └── ytvoseval.py │ │ └── ytvis_eval.py │ ├── modeling │ │ ├── __init__.py │ │ ├── criterion.py │ │ ├── matcher.py │ │ └── transformer_decoder │ │ │ ├── __init__.py │ │ │ ├── position_encoding.py │ │ │ └── video_mask2former_transformer_decoder.py │ ├── utils │ │ ├── __init__.py │ │ └── memory.py │ └── video_maskformer_model.py │ ├── predict.py │ ├── requirements.txt │ ├── setup.py │ ├── tools │ ├── README.md │ ├── analyze_model.py │ ├── convert-pretrained-swin-model-to-d2.py │ ├── convert-torchvision-to-d2.py │ ├── evaluate_coco_boundary_ap.py │ └── evaluate_pq_for_semantic_segmentation.py │ ├── train_net.py │ └── train_net_video.py └── tools └── train_net.py /.gitignore: -------------------------------------------------------------------------------- 1 | # output dir 2 | output 3 | 
instant_test_output 4 | inference_test_output 5 | 6 | 7 | *.png 8 | *.json 9 | *.diff 10 | *.jpg 11 | !/projects/DensePose/doc/images/*.jpg 12 | 13 | # compilation and distribution 14 | __pycache__ 15 | _ext 16 | *.pyc 17 | *.pyd 18 | *.so 19 | *.dll 20 | *.egg-info/ 21 | build/ 22 | dist/ 23 | wheels/ 24 | 25 | # pytorch/python/numpy formats 26 | *.pth 27 | *.pkl 28 | *.npy 29 | *.ts 30 | model_ts*.txt 31 | 32 | # ipython/jupyter notebooks 33 | *.ipynb 34 | **/.ipynb_checkpoints/ 35 | 36 | # Editor temporaries 37 | *.swn 38 | *.swo 39 | *.swp 40 | *~ 41 | 42 | # editor settings 43 | .idea 44 | .vscode 45 | _darcs 46 | 47 | # project dirs 48 | /detectron2/model_zoo/configs 49 | /odise/model_zoo/configs 50 | /datasets/* 51 | !/datasets/*.* 52 | /projects/*/datasets 53 | /models 54 | /snippet 55 | 56 | # Mac 57 | *.DS_Store 58 | 59 | # Gradio 60 | gradio_queue.db 61 | 62 | # CLIP 63 | *.pt 64 | 65 | # stable diffusion 66 | *.ckpt 67 | 68 | *.o -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include odise/data/datasets/openseg_labels/*.txt 2 | -------------------------------------------------------------------------------- /configs/Panoptic/odise_caption_coco_50e.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from detectron2.config import LazyCall as L 12 | from detectron2.solver import WarmupParamScheduler 13 | from fvcore.common.param_scheduler import MultiStepParamScheduler 14 | 15 | from ..common.models.odise_with_caption import model 16 | from ..common.data.coco_panoptic_semseg import dataloader 17 | from ..common.train import train 18 | from ..common.optim import AdamW as optimizer 19 | from ..common.data.pano_open_d2_eval import ( 20 | ade150_open_eval as _ade150_eval, 21 | ctx59_open_eval as _ctx59_eval, 22 | ade847_open_eval as _ade847_eval, 23 | ctx459_open_eval as _ctx459_eval, 24 | pas21_open_eval as _pas21_eval, 25 | ) 26 | 27 | train.max_iter = 92_188 28 | train.grad_clip = 0.01 29 | train.checkpointer.period = 4500 30 | 31 | lr_multiplier = L(WarmupParamScheduler)( 32 | scheduler=L(MultiStepParamScheduler)( 33 | values=[1.0, 0.1, 0.01], 34 | # assume 100e with batch-size 64 as original LSJ 35 | # Equivalent to 100 epochs. 
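# (Note: the scheduler below is defined on this 100-epoch / 184,375-iteration scale, while the
#  run itself stops at train.max_iter = 92,188 iterations, i.e. 92,188 * 64 / ~118k images
#  ≈ 50 epochs, which is what the "_50e" in the config name refers to.)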
36 | # 100 ep = 184375 iters * 64 images/iter / 118000 images/ep 37 | milestones=[163889, 177546], 38 | num_updates=184375, 39 | ), 40 | # for warmup length we adopted COCO LSJ setting 41 | warmup_length=500 / 184375, 42 | warmup_factor=0.067, 43 | ) 44 | 45 | optimizer.lr = 1e-4 46 | optimizer.weight_decay = 0.05 47 | 48 | dataloader.train.dataset.names = "coco_2017_train_panoptic_caption_with_sem_seg" 49 | 50 | _ade847_eval.final_iter_only = True 51 | _ctx459_eval.final_iter_only = True 52 | 53 | dataloader.extra_task = dict( 54 | eval_ade150=_ade150_eval, 55 | eval_ctx59=_ctx59_eval, 56 | eval_ade847=_ade847_eval, 57 | eval_ctx459=_ctx459_eval, 58 | eval_pas21=_pas21_eval, 59 | ) 60 | -------------------------------------------------------------------------------- /configs/Panoptic/odise_label_coco_50e.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from detectron2.config import LazyCall as L 12 | from detectron2.solver import WarmupParamScheduler 13 | from fvcore.common.param_scheduler import MultiStepParamScheduler 14 | 15 | from ..common.models.odise_with_label import model 16 | from ..common.data.coco_panoptic_semseg import dataloader 17 | from ..common.train import train 18 | from ..common.optim import AdamW as optimizer 19 | from ..common.data.pano_open_d2_eval import ( 20 | ade150_open_eval as _ade150_eval, 21 | ctx59_open_eval as _ctx59_eval, 22 | ade847_open_eval as _ade847_eval, 23 | ctx459_open_eval as _ctx459_eval, 24 | pas21_open_eval as _pas21_eval, 25 | ) 26 | 27 | train.max_iter = 92_188 28 | train.grad_clip = 0.01 29 | train.checkpointer.period = 4500 30 | 31 | lr_multiplier = L(WarmupParamScheduler)( 32 | scheduler=L(MultiStepParamScheduler)( 33 | values=[1.0, 0.1, 0.01], 34 | # assume 100e with batch-size 64 as original LSJ 35 | # Equivalent to 100 epochs. 36 | # 100 ep = 184375 iters * 64 images/iter / 118000 images/ep 37 | milestones=[163889, 177546], 38 | num_updates=184375, 39 | ), 40 | # for warmup length we adopted COCO LSJ setting 41 | warmup_length=500 / 184375, 42 | warmup_factor=0.067, 43 | ) 44 | 45 | optimizer.lr = 1e-4 46 | optimizer.weight_decay = 0.05 47 | 48 | _ade847_eval.final_iter_only = True 49 | _ctx459_eval.final_iter_only = True 50 | 51 | dataloader.extra_task = dict( 52 | eval_ade150=_ade150_eval, 53 | eval_ctx59=_ctx59_eval, 54 | eval_ade847=_ade847_eval, 55 | eval_ctx459=_ctx459_eval, 56 | eval_pas21=_pas21_eval, 57 | ) 58 | -------------------------------------------------------------------------------- /configs/common/data/coco_panoptic_semseg.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | # To view a copy of this license, visit 4 | # https://github.com/facebookresearch/detectron2/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | from omegaconf import OmegaConf 18 | 19 | import detectron2.data.transforms as T 20 | from detectron2.config import LazyCall as L 21 | from detectron2.data import get_detection_dataset_dicts 22 | from detectron2.data import DatasetMapper 23 | 24 | from odise.data import ( 25 | COCOPanopticDatasetMapper, 26 | build_d2_test_dataloader, 27 | build_d2_train_dataloader, 28 | get_openseg_labels, 29 | ) 30 | from odise.evaluation.d2_evaluator import ( 31 | COCOEvaluator, 32 | COCOPanopticEvaluator, 33 | SemSegEvaluator, 34 | ) 35 | from odise.modeling.wrapper.pano_wrapper import OpenPanopticInference 36 | from detectron2.data import MetadataCatalog 37 | 38 | dataloader = OmegaConf.create() 39 | 40 | dataloader.train = L(build_d2_train_dataloader)( 41 | dataset=L(get_detection_dataset_dicts)( 42 | names="coco_2017_train_panoptic_with_sem_seg", filter_empty=True 43 | ), 44 | mapper=L(COCOPanopticDatasetMapper)( 45 | is_train=True, 46 | # COCO LSJ aug 47 | augmentations=[ 48 | L(T.RandomFlip)(horizontal=True), 49 | L(T.ResizeScale)( 50 | min_scale=0.1, 51 | max_scale=2.0, 52 | target_height=1024, 53 | target_width=1024, 54 | ), 55 | L(T.FixedSizeCrop)(crop_size=(1024, 1024)), 56 | ], 57 | image_format="RGB", 58 | ), 59 | total_batch_size=64, 60 | num_workers=4, 61 | ) 62 | 63 | dataloader.test = L(build_d2_test_dataloader)( 64 | dataset=L(get_detection_dataset_dicts)( 65 | names="coco_2017_val_panoptic_with_sem_seg", 66 | filter_empty=False, 67 | ), 68 | mapper=L(DatasetMapper)( 69 | is_train=False, 70 | augmentations=[ 71 | L(T.ResizeShortestEdge)(short_edge_length=1024, sample_style="choice", max_size=2560), 72 | ], 73 | image_format="${...train.mapper.image_format}", 74 | ), 75 | local_batch_size=1, 76 | num_workers=1, 77 | ) 78 | 79 | dataloader.evaluator = [ 80 | L(COCOEvaluator)( 81 | dataset_name="${...test.dataset.names}", 82 | tasks=("segm",), 83 | ), 84 | L(SemSegEvaluator)( 85 | dataset_name="${...test.dataset.names}", 86 | ), 87 | L(COCOPanopticEvaluator)( 88 | dataset_name="${...test.dataset.names}", 89 | ), 90 | ] 91 | 92 | dataloader.wrapper = L(OpenPanopticInference)( 93 | labels=L(get_openseg_labels)(dataset="coco_panoptic", prompt_engineered=True), 94 | metadata=L(MetadataCatalog.get)(name="${...test.dataset.names}"), 95 | ) 96 | -------------------------------------------------------------------------------- /configs/common/models/odise_with_caption.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from detectron2.config import LazyCall as L 12 | from odise.modeling.meta_arch.ldm import LdmImplicitCaptionerExtractor 13 | from odise.modeling.backbone.feature_extractor import FeatureExtractorBackbone 14 | from .mask_generator_with_caption import model 15 | 16 | model.backbone = L(FeatureExtractorBackbone)( 17 | feature_extractor=L(LdmImplicitCaptionerExtractor)( 18 | encoder_block_indices=(5, 7), 19 | unet_block_indices=(2, 5, 8, 11), 20 | decoder_block_indices=(2, 5), 21 | steps=(0,), 22 | learnable_time_embed=True, 23 | num_timesteps=1, 24 | clip_model_name="ViT-L-14-336", 25 | ), 26 | out_features=["s2", "s3", "s4", "s5"], 27 | use_checkpoint=True, 28 | slide_training=True, 29 | ) 30 | model.sem_seg_head.pixel_decoder.transformer_in_features = ["s3", "s4", "s5"] 31 | model.clip_head.alpha = 0.35 32 | model.clip_head.beta = 0.65 33 | -------------------------------------------------------------------------------- /configs/common/models/odise_with_label.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from detectron2.config import LazyCall as L 12 | from odise.modeling.meta_arch.ldm import LdmImplicitCaptionerExtractor 13 | from odise.modeling.backbone.feature_extractor import FeatureExtractorBackbone 14 | from .mask_generator_with_label import model 15 | 16 | model.backbone = L(FeatureExtractorBackbone)( 17 | feature_extractor=L(LdmImplicitCaptionerExtractor)( 18 | encoder_block_indices=(5, 7), 19 | unet_block_indices=(2, 5, 8, 11), 20 | decoder_block_indices=(2, 5), 21 | steps=(0,), 22 | learnable_time_embed=True, 23 | num_timesteps=1, 24 | clip_model_name="ViT-L-14-336", 25 | ), 26 | out_features=["s2", "s3", "s4", "s5"], 27 | use_checkpoint=True, 28 | slide_training=True, 29 | ) 30 | model.sem_seg_head.pixel_decoder.transformer_in_features = ["s3", "s4", "s5"] 31 | model.clip_head.alpha = 0.3 32 | model.clip_head.beta = 0.7 33 | -------------------------------------------------------------------------------- /configs/common/optim.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # To view a copy of this license, visit 4 | # https://github.com/facebookresearch/detectron2/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 
11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | import torch 18 | 19 | from detectron2.config import LazyCall as L 20 | from detectron2.solver.build import get_default_optimizer_params 21 | 22 | 23 | AdamW = L(torch.optim.AdamW)( 24 | params=L(get_default_optimizer_params)( 25 | # params.model is meant to be set to the model object, before instantiating 26 | # the optimizer. 27 | weight_decay_norm=0.0, 28 | weight_decay_bias=0.0, 29 | ), 30 | lr="???", 31 | weight_decay="???", 32 | ) 33 | -------------------------------------------------------------------------------- /configs/common/schedule.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # To view a copy of this license, visit 4 | # https://github.com/facebookresearch/detectron2/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | from fvcore.common.param_scheduler import CosineParamScheduler 18 | 19 | from detectron2.config import LazyCall as L 20 | from detectron2.solver import WarmupParamScheduler 21 | 22 | cosine_lr_multiplier = L(WarmupParamScheduler)( 23 | scheduler=L(CosineParamScheduler)(start_value=1.0, end_value=0.01), 24 | warmup_length="???", 25 | warmup_method="linear", 26 | warmup_factor=0.001, 27 | ) 28 | -------------------------------------------------------------------------------- /configs/common/train.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # To view a copy of this license, visit 4 | # https://github.com/facebookresearch/detectron2/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 
11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | # Common training-related configs that are designed for "tools/lazyconfig_train_net.py" 18 | # You can use your own instead, together with your own train_net.py 19 | 20 | train = dict( 21 | output_dir="./output", 22 | init_checkpoint="", 23 | max_iter="???", 24 | amp=dict( 25 | enabled=False, 26 | opt_level=None, 27 | ), # options for Automatic Mixed Precision 28 | grad_clip=None, 29 | ddp=dict( # options for DistributedDataParallel 30 | broadcast_buffers=False, 31 | find_unused_parameters=False, 32 | fp16_compression=False, 33 | ), 34 | checkpointer=dict(period=5000, max_to_keep=2), # options for PeriodicCheckpointer 35 | eval_period="${train.checkpointer.period}", 36 | log_period=50, 37 | device="cuda", 38 | seed=42, 39 | # ... 40 | wandb=dict( 41 | enable_writer=False, 42 | resume=False, 43 | project="ODISE", 44 | ), 45 | cfg_name="", 46 | run_name="", 47 | run_tag="", 48 | reference_world_size=0, 49 | ) 50 | -------------------------------------------------------------------------------- /datasets/ade20k_instance_catid_mapping.txt: -------------------------------------------------------------------------------- 1 | Instacne100 SceneParse150 FullADE20K 2 | 1 8 165 3 | 2 9 3055 4 | 3 11 350 5 | 4 13 1831 6 | 5 15 774 7 | 5 15 783 8 | 6 16 2684 9 | 7 19 687 10 | 8 20 471 11 | 9 21 401 12 | 10 23 1735 13 | 11 24 2473 14 | 12 25 2329 15 | 13 28 1564 16 | 14 31 57 17 | 15 32 2272 18 | 16 33 907 19 | 17 34 724 20 | 18 36 2985 21 | 18 36 533 22 | 19 37 1395 23 | 20 38 155 24 | 21 39 2053 25 | 22 40 689 26 | 23 42 266 27 | 24 43 581 28 | 25 44 2380 29 | 26 45 491 30 | 27 46 627 31 | 28 48 2388 32 | 29 50 943 33 | 30 51 2096 34 | 31 54 2530 35 | 32 56 420 36 | 33 57 1948 37 | 34 58 1869 38 | 35 59 2251 39 | 36 63 239 40 | 37 65 571 41 | 38 66 2793 42 | 39 67 978 43 | 40 68 236 44 | 41 70 181 45 | 42 71 629 46 | 43 72 2598 47 | 44 73 1744 48 | 45 74 1374 49 | 46 75 591 50 | 47 76 2679 51 | 48 77 223 52 | 49 79 47 53 | 50 81 327 54 | 51 82 2821 55 | 52 83 1451 56 | 53 84 2880 57 | 54 86 480 58 | 55 87 77 59 | 56 88 2616 60 | 57 89 246 61 | 57 89 247 62 | 58 90 2733 63 | 59 91 14 64 | 60 93 38 65 | 61 94 1936 66 | 62 96 120 67 | 63 98 1702 68 | 64 99 249 69 | 65 103 2928 70 | 66 104 2337 71 | 67 105 1023 72 | 68 108 2989 73 | 69 109 1930 74 | 70 111 2586 75 | 71 112 131 76 | 72 113 146 77 | 73 116 95 78 | 74 117 1563 79 | 75 119 1708 80 | 76 120 103 81 | 77 121 1002 82 | 78 122 2569 83 | 79 124 2833 84 | 80 125 1551 85 | 81 126 1981 86 | 82 127 29 87 | 83 128 187 88 | 84 130 747 89 | 85 131 2254 90 | 86 133 2262 91 | 87 134 1260 92 | 88 135 2243 93 | 89 136 2932 94 | 90 137 2836 95 | 91 138 2850 96 | 92 139 64 97 | 93 140 894 98 | 94 143 1919 99 | 95 144 1583 100 | 96 145 318 101 | 97 147 2046 102 | 98 148 1098 103 | 99 149 530 104 | 100 150 954 105 | -------------------------------------------------------------------------------- /datasets/prepare_ade20k_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # ------------------------------------------------------------------------------ 5 | # Copyright (c) Facebook, Inc. and its affiliates. 
6 | # To view a copy of this license, visit 7 | # https://github.com/facebookresearch/Mask2Former/blob/main/LICENSE 8 | # ------------------------------------------------------------------------------ 9 | 10 | import os 11 | from pathlib import Path 12 | 13 | import numpy as np 14 | import tqdm 15 | from PIL import Image 16 | 17 | 18 | def convert(input, output): 19 | img = np.asarray(Image.open(input)) 20 | assert img.dtype == np.uint8 21 | img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1 22 | Image.fromarray(img).save(output) 23 | 24 | 25 | if __name__ == "__main__": 26 | dataset_dir = ( 27 | Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ade" / "ADEChallengeData2016" 28 | ) 29 | for name in ["training", "validation"]: 30 | annotation_dir = dataset_dir / "annotations" / name 31 | output_dir = dataset_dir / "annotations_detectron2" / name 32 | output_dir.mkdir(parents=True, exist_ok=True) 33 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 34 | output_file = output_dir / file.name 35 | convert(file, output_file) 36 | -------------------------------------------------------------------------------- /datasets/prepare_coco_caption.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | # Convert adding COCO captions into annotation json 12 | 13 | import json 14 | import os 15 | from collections import defaultdict 16 | 17 | 18 | def load_coco_caption(): 19 | id2caption = defaultdict(list) 20 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco") 21 | for json_file in ["captions_train2017.json", "captions_val2017.json"]: 22 | with open(os.path.join(dataset_dir, "annotations", json_file)) as f: 23 | obj = json.load(f) 24 | for ann in obj["annotations"]: 25 | id2caption[int(ann["image_id"])].append(ann["caption"]) 26 | 27 | return id2caption 28 | 29 | 30 | def create_annotation_with_caption(input_json, output_json): 31 | id2coco_caption = load_coco_caption() 32 | 33 | with open(input_json) as f: 34 | obj = json.load(f) 35 | 36 | coco_count = 0 37 | 38 | print(f"Starting to add captions to {input_json} ...") 39 | print(f"Total images: {len(obj['annotations'])}") 40 | for ann in obj["annotations"]: 41 | image_id = int(ann["image_id"]) 42 | if image_id in id2coco_caption: 43 | ann["coco_captions"] = id2coco_caption[image_id] 44 | coco_count += 1 45 | print(f"Found {coco_count} captions from COCO ") 46 | 47 | print(f"Start writing to {output_json} ...") 48 | with open(output_json, "w") as f: 49 | json.dump(obj, f) 50 | 51 | 52 | if __name__ == "__main__": 53 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco") 54 | for s in ["val2017", "val2017_100", "train2017"]: 55 | create_annotation_with_caption( 56 | os.path.join(dataset_dir, "annotations/panoptic_{}.json".format(s)), 57 | os.path.join(dataset_dir, "annotations/panoptic_caption_{}.json".format(s)), 58 | ) 59 | -------------------------------------------------------------------------------- /datasets/prepare_lvis_openseg_labels.py: 
-------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | import json 12 | import os 13 | 14 | if __name__ == "__main__": 15 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco") 16 | ann = os.path.join(dataset_dir, "annotations/lvis_v1_val.json") 17 | print("Loading", ann) 18 | data = json.load(open(ann, "r")) 19 | cat_names = [x["name"] for x in sorted(data["categories"], key=lambda x: x["id"])] 20 | nonrare_names = [ 21 | x["name"] 22 | for x in sorted(data["categories"], key=lambda x: x["id"]) 23 | if x["frequency"] != "r" 24 | ] 25 | 26 | synonyms = [x["synonyms"] for x in sorted(data["categories"], key=lambda x: x["id"])] 27 | nonrare_synonyms = [ 28 | x["synonyms"] 29 | for x in sorted(data["categories"], key=lambda x: x["id"]) 30 | if x["frequency"] != "r" 31 | ] 32 | 33 | with open("datasets/openseg/lvis_1203.txt", "w") as f: 34 | for idx, cat in enumerate(cat_names): 35 | cat = cat.replace("_", " ") 36 | f.write(f"{idx+1}:{cat}\n") 37 | 38 | with open("datasets/openseg/lvis_1203_with_prompt_eng.txt", "w") as f: 39 | for idx, syns in enumerate(synonyms): 40 | cat = ",".join(syns) 41 | cat = cat.replace("_", " ") 42 | f.write(f"{idx+1}:{cat}\n") 43 | 44 | with open("datasets/openseg/lvis_nonrare_866.txt", "w") as f: 45 | for idx, cat in enumerate(nonrare_names): 46 | cat = cat.replace("_", " ") 47 | f.write(f"{idx+1}:{cat}\n") 48 | 49 | with open("datasets/openseg/lvis_nonrare_866_with_prompt_eng.txt", "w") as f: 50 | for idx, syns in enumerate(nonrare_synonyms): 51 | cat = ",".join(syns) 52 | cat = cat.replace("_", " ") 53 | f.write(f"{idx+1}:{cat}\n") 54 | -------------------------------------------------------------------------------- /datasets/prepare_pascal_ctx_full_sem_seg.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | import os 12 | import numpy as np 13 | from pathlib import Path 14 | from PIL import Image 15 | import scipy.io as sio 16 | 17 | import tqdm 18 | 19 | 20 | def generate_labels(mat_file, out_dir): 21 | 22 | mat = sio.loadmat(mat_file) 23 | label_map = mat["LabelMap"] 24 | assert label_map.dtype == np.uint16 25 | label_map[label_map == 0] = 65535 26 | label_map = label_map - 1 27 | label_map[label_map == 65534] = 65535 28 | 29 | out_file = out_dir / Path(mat_file.name).with_suffix(".tif") 30 | Image.fromarray(label_map).save(out_file) 31 | 32 | 33 | if __name__ == "__main__": 34 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "pascal_ctx_d2" 35 | voc_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "VOCdevkit/VOC2010" 36 | mat_dir = voc_dir / "trainval" 37 | for split in ["training", "validation"]: 38 | file_names = list((dataset_dir / "images" / split).glob("*.jpg")) 39 | output_img_dir = dataset_dir / "images" / split 40 | output_ann_dir = dataset_dir / "annotations_ctx459" / split 41 | 42 | output_img_dir.mkdir(parents=True, exist_ok=True) 43 | output_ann_dir.mkdir(parents=True, exist_ok=True) 44 | 45 | for file_name in tqdm.tqdm(file_names): 46 | mat_file_path = mat_dir / f"{file_name.stem}.mat" 47 | 48 | generate_labels(mat_file_path, output_ann_dir) 49 | -------------------------------------------------------------------------------- /datasets/prepare_pascal_ctx_sem_seg.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | import os 12 | from pathlib import Path 13 | import shutil 14 | 15 | import numpy as np 16 | import tqdm 17 | from PIL import Image 18 | import multiprocessing as mp 19 | import functools 20 | from detail import Detail 21 | 22 | # fmt: off 23 | _mapping = np.sort( 24 | np.array([ 25 | 0, 2, 259, 260, 415, 324, 9, 258, 144, 18, 19, 22, 23, 397, 25, 284, 26 | 158, 159, 416, 33, 162, 420, 454, 295, 296, 427, 44, 45, 46, 308, 59, 27 | 440, 445, 31, 232, 65, 354, 424, 68, 326, 72, 458, 34, 207, 80, 355, 28 | 85, 347, 220, 349, 360, 98, 187, 104, 105, 366, 189, 368, 113, 115 29 | ])) 30 | # fmt: on 31 | _key = np.array(range(len(_mapping))).astype("uint8") 32 | 33 | 34 | def generate_labels(img_info, detail_api, out_dir): 35 | def _class_to_index(mask, _mapping, _key): 36 | # assert the values 37 | values = np.unique(mask) 38 | for i in range(len(values)): 39 | assert values[i] in _mapping 40 | index = np.digitize(mask.ravel(), _mapping, right=True) 41 | return _key[index].reshape(mask.shape) 42 | 43 | sem_seg = _class_to_index(detail_api.getMask(img_info), _mapping=_mapping, _key=_key) 44 | sem_seg = sem_seg - 1 # 0 (ignore) becomes 255. 
others are shifted by 1 45 | filename = img_info["file_name"] 46 | 47 | Image.fromarray(sem_seg).save(out_dir / filename.replace("jpg", "png")) 48 | 49 | 50 | def copy_images(img_info, img_dir, out_dir): 51 | filename = img_info["file_name"] 52 | shutil.copy2(img_dir / filename, out_dir / filename) 53 | 54 | 55 | if __name__ == "__main__": 56 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "pascal_ctx_d2" 57 | voc_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "VOCdevkit/VOC2010" 58 | for split in ["training", "validation"]: 59 | img_dir = voc_dir / "JPEGImages" 60 | if split == "training": 61 | detail_api = Detail(voc_dir / "trainval_merged.json", img_dir, "train") 62 | else: 63 | detail_api = Detail(voc_dir / "trainval_merged.json", img_dir, "val") 64 | img_infos = detail_api.getImgs() 65 | 66 | output_img_dir = dataset_dir / "images" / split 67 | output_ann_dir = dataset_dir / "annotations_ctx59" / split 68 | 69 | output_img_dir.mkdir(parents=True, exist_ok=True) 70 | output_ann_dir.mkdir(parents=True, exist_ok=True) 71 | 72 | pool = mp.Pool(processes=max(mp.cpu_count() // 2, 4)) 73 | 74 | pool.map( 75 | functools.partial(copy_images, img_dir=img_dir, out_dir=output_img_dir), 76 | tqdm.tqdm(img_infos, desc=f"Writing {split} images to {output_img_dir} ..."), 77 | chunksize=100, 78 | ) 79 | 80 | pool.map( 81 | functools.partial(generate_labels, detail_api=detail_api, out_dir=output_ann_dir), 82 | tqdm.tqdm(img_infos, desc=f"Writing {split} images to {output_ann_dir} ..."), 83 | chunksize=100, 84 | ) 85 | -------------------------------------------------------------------------------- /datasets/prepare_pascal_voc_sem_seg.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | import os 12 | from pathlib import Path 13 | import shutil 14 | 15 | import numpy as np 16 | import tqdm 17 | from PIL import Image 18 | 19 | 20 | def convert(input, output): 21 | img = np.asarray(Image.open(input)) 22 | assert img.dtype == np.uint8 23 | # do nothing 24 | Image.fromarray(img).save(output) 25 | 26 | 27 | if __name__ == "__main__": 28 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "pascal_voc_d2" 29 | voc_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "VOCdevkit/VOC2012" 30 | for split in ["training", "validation"]: 31 | if split == "training": 32 | img_name_path = voc_dir / "ImageSets/Segmentation/train.txt" 33 | else: 34 | img_name_path = voc_dir / "ImageSets/Segmentation/val.txt" 35 | img_dir = voc_dir / "JPEGImages" 36 | ann_dir = voc_dir / "SegmentationClass" 37 | 38 | output_img_dir = dataset_dir / "images" / split 39 | output_ann_dir = dataset_dir / "annotations_pascal21" / split 40 | 41 | output_img_dir.mkdir(parents=True, exist_ok=True) 42 | output_ann_dir.mkdir(parents=True, exist_ok=True) 43 | 44 | with open(img_name_path) as f: 45 | for line in tqdm.tqdm(f.readlines()): 46 | img_name = line.strip() 47 | img_path = img_dir / f"{img_name}.jpg" 48 | ann_path = ann_dir / f"{img_name}.png" 49 | 50 | # print(f'copy2 {output_img_dir}') 51 | shutil.copy2(img_path, output_img_dir) 52 | # print(f"convert {ann_dir} to {output_ann_dir / f'{img_name}.png'}") 53 | convert(ann_path, output_ann_dir / f"{img_name}.png") 54 | -------------------------------------------------------------------------------- /demo/examples/ade.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/demo/examples/ade.jpg -------------------------------------------------------------------------------- /demo/examples/coco.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/demo/examples/coco.jpg -------------------------------------------------------------------------------- /demo/examples/ego4d.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/demo/examples/ego4d.jpg -------------------------------------------------------------------------------- /demo/examples/purse.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/demo/examples/purse.jpeg -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | FROM pytorch/pytorch:1.13.1-cuda11.6-cudnn8-devel 12 | 13 | WORKDIR /workspace 14 | 15 | ARG DEBIAN_FRONTEND=noninteractive 16 | ENV TZ=US/Pacific 17 | 18 | RUN apt-get update && apt-get install -y \ 19 | build-essential \ 20 | cmake \ 21 | curl \ 22 | g++ \ 23 | wget \ 24 | bzip2 \ 25 | git \ 26 | vim \ 27 | tmux \ 28 | htop \ 29 | git \ 30 | zip \ 31 | unzip \ 32 | ca-certificates \ 33 | libosmesa6-dev \ 34 | libgl1-mesa-glx \ 35 | libglfw3 \ 36 | patchelf \ 37 | libglu1-mesa \ 38 | libxext6 \ 39 | libxtst6 \ 40 | libxrender1 \ 41 | libxi6 \ 42 | libjpeg-dev \ 43 | libpng-dev \ 44 | libopenblas-dev \ 45 | libopencv-dev \ 46 | libyaml-dev \ 47 | libavformat-dev \ 48 | libavcodec-dev \ 49 | libswscale-dev \ 50 | libavutil-dev \ 51 | libavfilter-dev \ 52 | libavdevice-dev \ 53 | libswresample-dev \ 54 | less \ 55 | groff \ 56 | mpich 57 | 58 | RUN apt-get clean && rm -rf /var/lib/apt/lists/* 59 | 60 | # Install git lfs 61 | RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash 62 | RUN apt-get install -y git-lfs 63 | RUN git lfs install 64 | 65 | 66 | RUN curl https://rclone.org/install.sh | bash 67 | 68 | # Set timezone 69 | RUN ln -sf /usr/share/zoneinfo/US/Pacific /etc/localtime 70 | 71 | # Set CUDA_ROOT 72 | RUN export CUDA_HOME="/usr/local/cuda" 73 | 74 | # Install pytorch 75 | #RUN conda install pytorch torchvision cudatoolkit=11.1 -c pytorch -c conda-forge -y 76 | 77 | # Install zsh 78 | RUN sh -c "$(wget -O- https://github.com/deluan/zsh-in-docker/releases/download/v1.1.1/zsh-in-docker.sh)" -t robbyrussell -p git 79 | 80 | # Set a fixed model cache directory. 
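# (FVCORE_CACHE is the cache directory that fvcore/detectron2 use for downloaded checkpoints
#  and files; /tmp is presumably chosen here to keep those downloads out of the container's $HOME.)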
81 | ENV FVCORE_CACHE="/tmp" 82 | 83 | ENV HOME /workspace 84 | -------------------------------------------------------------------------------- /figs/github_arch.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/figs/github_arch.gif -------------------------------------------------------------------------------- /figs/github_vis_ade_0.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/figs/github_vis_ade_0.gif -------------------------------------------------------------------------------- /figs/github_vis_ade_1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/figs/github_vis_ade_1.gif -------------------------------------------------------------------------------- /figs/github_vis_coco_0.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/figs/github_vis_coco_0.gif -------------------------------------------------------------------------------- /figs/github_vis_coco_1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/figs/github_vis_coco_1.gif -------------------------------------------------------------------------------- /figs/github_vis_ego4d_0.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/figs/github_vis_ego4d_0.gif -------------------------------------------------------------------------------- /figs/github_vis_ego4d_1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/figs/github_vis_ego4d_1.gif -------------------------------------------------------------------------------- /figs/teaser.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/figs/teaser.jpg -------------------------------------------------------------------------------- /odise/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | # This line will be programatically read/write by setup.py. 12 | # Leave them at the bottom of this file and don't touch them. 
13 | __version__ = "0.1" 14 | -------------------------------------------------------------------------------- /odise/checkpoint/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .odise_checkpointer import ODISECheckpointer 12 | 13 | __all__ = ["ODISECheckpointer"] 14 | -------------------------------------------------------------------------------- /odise/config/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .instantiate import instantiate_odise 12 | from .utils import auto_scale_workers 13 | 14 | __all__ = [ 15 | "instantiate_odise", 16 | "auto_scale_workers", 17 | ] 18 | -------------------------------------------------------------------------------- /odise/config/instantiate.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from detectron2.config import instantiate 12 | 13 | 14 | def instantiate_odise(cfg): 15 | backbone = instantiate(cfg.backbone) 16 | cfg.sem_seg_head.input_shape = backbone.output_shape() 17 | cfg.sem_seg_head.pixel_decoder.input_shape = backbone.output_shape() 18 | cfg.backbone = backbone 19 | model = instantiate(cfg) 20 | 21 | return model 22 | -------------------------------------------------------------------------------- /odise/config/utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # To view a copy of this license, visit 4 | # https://github.com/facebookresearch/detectron2/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 
11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | from copy import deepcopy 18 | 19 | 20 | def auto_scale_workers(cfg, num_workers: int): 21 | """ 22 | When the config is defined for certain number of workers (according to 23 | ``cfg.train.reference_world_size``) that's different from the number of 24 | workers currently in use, returns a new cfg where the total batch size 25 | is scaled so that the per-GPU batch size stays the same as the 26 | original ``total_batch_size // reference_world_size``. 27 | 28 | Other config options are also scaled accordingly: 29 | * training steps and warmup steps are scaled inverse proportionally. 30 | * learning rate are scaled proportionally, following :paper:`ImageNet in 1h`. 31 | 32 | For example, with the original config like the following: 33 | 34 | .. code-block:: yaml 35 | 36 | dataloader.train.total_batch_size: 16 37 | optimizer.lr: 0.1 38 | train.reference_world_size: 8 39 | train.max_iter: 5000 40 | train.checkpointer.period: 1000 41 | 42 | When this config is used on 16 GPUs instead of the reference number 8, 43 | calling this method will return a new config with: 44 | 45 | .. code-block:: yaml 46 | 47 | dataloader.train.total_batch_size: 32 48 | optimizer.lr: 0.2 49 | train.reference_world_size: 16 50 | train.max_iter: 2500 51 | train.checkpointer.period: 500 52 | 53 | Note that both the original config and this new config can be trained on 16 GPUs. 54 | It's up to user whether to enable this feature (by setting ``reference_world_size``). 55 | 56 | Returns: 57 | CfgNode: a new config. Same as original if ``cfg.SOLVER.REFERENCE_WORLD_SIZE==0``. 58 | """ 59 | old_world_size = cfg.train.reference_world_size 60 | if old_world_size == 0 or old_world_size == num_workers: 61 | print("No need to scale the config.") 62 | return cfg 63 | cfg = deepcopy(cfg) 64 | 65 | assert cfg.dataloader.train.total_batch_size % old_world_size == 0, ( 66 | f"Invalid reference_world_size in config! " 67 | f"{cfg.dataloader.train.total_batch_size} % {old_world_size} != 0" 68 | ) 69 | scale = num_workers / old_world_size 70 | bs = cfg.dataloader.train.total_batch_size = int( 71 | round(cfg.dataloader.train.total_batch_size * scale) 72 | ) 73 | lr = cfg.optimizer.lr = cfg.optimizer.lr * scale 74 | max_iter = cfg.train.max_iter = int(round(cfg.train.max_iter / scale)) 75 | cfg.train.eval_period = int(round(cfg.train.eval_period / scale)) 76 | cfg.train.checkpointer.period = int(round(cfg.train.checkpointer.period / scale)) 77 | cfg.train.reference_world_size = num_workers # maintain invariant 78 | print( 79 | f"Auto-scaling the config to batch_size={bs}, learning_rate={lr}, " f"max_iter={max_iter}." 80 | ) 81 | 82 | return cfg 83 | -------------------------------------------------------------------------------- /odise/data/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | 12 | from .build import get_openseg_labels, build_d2_train_dataloader, build_d2_test_dataloader 13 | from .dataset_mapper import COCOPanopticDatasetMapper 14 | from .datasets import ( 15 | register_all_ctx59, 16 | register_all_pascal21, 17 | register_all_ctx459, 18 | register_all_coco_panoptic_annos_sem_seg_caption, 19 | ) 20 | 21 | __all__ = [ 22 | "COCOPanopticDatasetMapper", 23 | "get_openseg_labels", 24 | "build_d2_train_dataloader", 25 | "build_d2_test_dataloader", 26 | "register_all_ctx59", 27 | "register_all_pascal21", 28 | "register_all_ctx459", 29 | "register_all_coco_panoptic_annos_sem_seg_caption", 30 | ] 31 | -------------------------------------------------------------------------------- /odise/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .register_pascal import register_all_ctx59, register_all_pascal21, register_all_ctx459 12 | from .register_coco_caption import register_all_coco_panoptic_annos_sem_seg_caption 13 | 14 | __all__ = [ 15 | "register_all_ctx59", 16 | "register_all_pascal21", 17 | "register_all_ctx459", 18 | "register_all_coco_panoptic_annos_sem_seg_caption", 19 | ] 20 | -------------------------------------------------------------------------------- /odise/data/datasets/openseg_labels/README.md: -------------------------------------------------------------------------------- 1 | # Acknowledgement 2 | 3 | We thank Golnaz Ghiasi for providing the [OpenSeg](https://arxiv.org/abs/2112.12143) labels for evaluation. 
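## Label file format

Each `*.txt` file in this directory lists one category per line as `id:name`; the
`*_with_prompt_eng.txt` variants append comma-separated synonyms after the canonical name, and
ids that are unused in a dataset are kept as `invalid_class_id` placeholders so the numbering
stays contiguous. A minimal parsing sketch (an illustrative helper, not the repo's
`get_openseg_labels` implementation) could look like:

```python
from pathlib import Path


def read_openseg_labels(path):
    """Parse an openseg label file into (category_id, [name, synonym, ...]) pairs."""
    labels = []
    for line in Path(path).read_text().splitlines():
        if not line.strip():
            continue
        idx, names = line.split(":", 1)
        labels.append((int(idx), [n.strip() for n in names.split(",")]))
    return labels


# Example: keep only real categories, dropping the invalid_class_id placeholders.
coco_labels = [
    (i, names)
    for i, names in read_openseg_labels("coco_panoptic_with_prompt_eng.txt")
    if names != ["invalid_class_id"]
]
```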
4 | 5 | 6 | ## Citation 7 | 8 | ```BiBTeX 9 | @inproceedings{ghiasi2022scaling, 10 | title={Scaling open-vocabulary image segmentation with image-level labels}, 11 | author={Ghiasi, Golnaz and Gu, Xiuye and Cui, Yin and Lin, Tsung-Yi}, 12 | booktitle={Computer Vision--ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23--27, 2022, Proceedings, Part XXXVI}, 13 | pages={540--557}, 14 | year={2022}, 15 | organization={Springer} 16 | } 17 | ``` 18 | -------------------------------------------------------------------------------- /odise/data/datasets/openseg_labels/coco_panoptic.txt: -------------------------------------------------------------------------------- 1 | 0:invalid_class_id 2 | 1:person 3 | 2:bicycle 4 | 3:car 5 | 4:motorcycle 6 | 5:airplane 7 | 6:bus 8 | 7:train 9 | 8:truck 10 | 9:boat 11 | 10:traffic light 12 | 11:fire hydrant 13 | 12:invalid_class_id 14 | 13:stop sign 15 | 14:parking meter 16 | 15:bench 17 | 16:bird 18 | 17:cat 19 | 18:dog 20 | 19:horse 21 | 20:sheep 22 | 21:cow 23 | 22:elephant 24 | 23:bear 25 | 24:zebra 26 | 25:giraffe 27 | 26:invalid_class_id 28 | 27:backpack 29 | 28:umbrella 30 | 29:invalid_class_id 31 | 30:invalid_class_id 32 | 31:handbag 33 | 32:tie 34 | 33:suitcase 35 | 34:frisbee 36 | 35:skis 37 | 36:snowboard 38 | 37:sports ball 39 | 38:kite 40 | 39:baseball bat 41 | 40:baseball glove 42 | 41:skateboard 43 | 42:surfboard 44 | 43:tennis racket 45 | 44:bottle 46 | 45:invalid_class_id 47 | 46:wine glass 48 | 47:cup 49 | 48:fork 50 | 49:knife 51 | 50:spoon 52 | 51:bowl 53 | 52:banana 54 | 53:apple 55 | 54:sandwich 56 | 55:orange 57 | 56:broccoli 58 | 57:carrot 59 | 58:hot dog 60 | 59:pizza 61 | 60:donut 62 | 61:cake 63 | 62:chair 64 | 63:couch 65 | 64:potted plant 66 | 65:bed 67 | 66:invalid_class_id 68 | 67:dining table 69 | 68:invalid_class_id 70 | 69:invalid_class_id 71 | 70:toilet 72 | 71:invalid_class_id 73 | 72:tv 74 | 73:laptop 75 | 74:mouse 76 | 75:remote 77 | 76:keyboard 78 | 77:cell phone 79 | 78:microwave 80 | 79:oven 81 | 80:toaster 82 | 81:sink 83 | 82:refrigerator 84 | 83:invalid_class_id 85 | 84:book 86 | 85:clock 87 | 86:vase 88 | 87:scissors 89 | 88:teddy bear 90 | 89:hair drier 91 | 90:toothbrush 92 | 91:invalid_class_id 93 | 92:banner 94 | 93:blanket 95 | 94:invalid_class_id 96 | 95:bridge 97 | 96:invalid_class_id 98 | 97:invalid_class_id 99 | 98:invalid_class_id 100 | 99:invalid_class_id 101 | 100:cardboard 102 | 101:invalid_class_id 103 | 102:invalid_class_id 104 | 103:invalid_class_id 105 | 104:invalid_class_id 106 | 105:invalid_class_id 107 | 106:invalid_class_id 108 | 107:counter 109 | 108:invalid_class_id 110 | 109:curtain 111 | 110:invalid_class_id 112 | 111:invalid_class_id 113 | 112:door 114 | 113:invalid_class_id 115 | 114:invalid_class_id 116 | 115:invalid_class_id 117 | 116:invalid_class_id 118 | 117:invalid_class_id 119 | 118:wood floor 120 | 119:flower 121 | 120:invalid_class_id 122 | 121:invalid_class_id 123 | 122:fruit 124 | 123:invalid_class_id 125 | 124:invalid_class_id 126 | 125:gravel 127 | 126:invalid_class_id 128 | 127:invalid_class_id 129 | 128:house 130 | 129:invalid_class_id 131 | 130:light 132 | 131:invalid_class_id 133 | 132:invalid_class_id 134 | 133:mirror-stuff 135 | 134:invalid_class_id 136 | 135:invalid_class_id 137 | 136:invalid_class_id 138 | 137:invalid_class_id 139 | 138:net 140 | 139:invalid_class_id 141 | 140:invalid_class_id 142 | 141:pillow 143 | 142:invalid_class_id 144 | 143:invalid_class_id 145 | 144:platform 146 | 145:playingfield 147 | 146:invalid_class_id 148 | 147:railroad 149 | 
148:river 150 | 149:road 151 | 150:invalid_class_id 152 | 151:roof 153 | 152:invalid_class_id 154 | 153:invalid_class_id 155 | 154:sand 156 | 155:sea 157 | 156:shelf 158 | 157:invalid_class_id 159 | 158:invalid_class_id 160 | 159:snow 161 | 160:invalid_class_id 162 | 161:stairs 163 | 162:invalid_class_id 164 | 163:invalid_class_id 165 | 164:invalid_class_id 166 | 165:invalid_class_id 167 | 166:tent 168 | 167:invalid_class_id 169 | 168:towel 170 | 169:invalid_class_id 171 | 170:invalid_class_id 172 | 171:brick wall 173 | 172:invalid_class_id 174 | 173:invalid_class_id 175 | 174:invalid_class_id 176 | 175:stone wall 177 | 176:tile wall 178 | 177:wood wall 179 | 178:water 180 | 179:invalid_class_id 181 | 180:window blind 182 | 181:window 183 | 182:invalid_class_id 184 | 183:invalid_class_id 185 | 184:tree 186 | 185:fence 187 | 186:ceiling 188 | 187:sky 189 | 188:cabinet 190 | 189:table 191 | 190:floor 192 | 191:pavement 193 | 192:mountain 194 | 193:grass 195 | 194:dirt 196 | 195:paper 197 | 196:food 198 | 197:building 199 | 198:rock 200 | 199:wall 201 | 200:rug 202 | -------------------------------------------------------------------------------- /odise/data/datasets/openseg_labels/pascal_context_59.txt: -------------------------------------------------------------------------------- 1 | 0:invalid_class_id 2 | 1:aeroplane 3 | 2:bag 4 | 3:bed 5 | 4:bedclothes 6 | 5:bench 7 | 6:bicycle 8 | 7:bird 9 | 8:boat 10 | 9:book 11 | 10:bottle 12 | 11:building 13 | 12:bus 14 | 13:cabinet 15 | 14:car 16 | 15:cat 17 | 16:ceiling 18 | 17:chair 19 | 18:cloth 20 | 19:computer 21 | 20:cow 22 | 21:cup 23 | 22:curtain 24 | 23:dog 25 | 24:door 26 | 25:fence 27 | 26:floor 28 | 27:flower 29 | 28:food 30 | 29:grass 31 | 30:ground 32 | 31:horse 33 | 32:keyboard 34 | 33:light 35 | 34:motorbike 36 | 35:mountain 37 | 36:mouse 38 | 37:person 39 | 38:plate 40 | 39:platform 41 | 40:pottedplant 42 | 41:road 43 | 42:rock 44 | 43:sheep 45 | 44:shelves 46 | 45:sidewalk 47 | 46:sign 48 | 47:sky 49 | 48:snow 50 | 49:sofa 51 | 50:diningtable 52 | 51:track 53 | 52:train 54 | 53:tree 55 | 54:truck 56 | 55:tvmonitor 57 | 56:wall 58 | 57:water 59 | 58:window 60 | 59:wood 61 | -------------------------------------------------------------------------------- /odise/data/datasets/openseg_labels/pascal_context_59_with_prompt_eng.txt: -------------------------------------------------------------------------------- 1 | 0:invalid_class_id 2 | 1:aeroplane,aeroplanes,airplanes,airplane 3 | 2:bag,bags 4 | 3:bed,beds 5 | 4:bedclothes 6 | 5:bench,benches 7 | 6:bicycle,bicycles 8 | 7:bird,birds 9 | 8:boat,boats 10 | 9:book,books 11 | 10:bottle,bottles,water bottle 12 | 11:building,buildings 13 | 12:bus,buses 14 | 13:cabinet,cabinets,drawer,drawers 15 | 14:car,cars 16 | 15:cat,cats,kitties,kitty 17 | 16:ceiling 18 | 17:chair,chairs 19 | 18:cloth,clothes 20 | 19:computer case 21 | 20:cow,cows 22 | 21:cup,cups 23 | 22:curtain,curtains 24 | 23:dog,dogs,puppy,puppies 25 | 24:door,doors 26 | 25:fence,fences 27 | 26:floor,tile ground,carpet,rug,flooring 28 | 27:flower,flowers 29 | 28:food 30 | 29:grass,grasses,lawn,turf 31 | 30:ground,soil,soil ground,dirt ground 32 | 31:horse,horses,foal 33 | 32:keyboard,keyboards 34 | 33:lamp,lamps,bulb,bulbs 35 | 34:motorbike,motorcycle,motorbikes,motorcycles 36 | 35:mountain,mountains 37 | 36:mouse 38 | 37:person,child,girl,boy,woman,man,people,childeren,girls,boys,women,men,lady,guy,ladies,guys 39 | 38:plate,plates 40 | 39:platform,platforms 41 | 40:pottedplant,pottedplants,plant pot,plant pots,planter,planters 42 | 
41:street,streets 43 | 42:rock,rocks,stone,stones 44 | 43:sheep 45 | 44:shelves,shelf 46 | 45:sidewalk 47 | 46:sign,signs 48 | 47:sky,clouds 49 | 48:snow 50 | 49:sofa 51 | 50:diningtable,diningtables,table,tables,desk,desks,side table,side tables,coffee table 52 | 51:track,train track,railroad 53 | 52:train,trains,locomotive,locomotives,freight train 54 | 53:tree,trees 55 | 54:truck,trucks 56 | 55:tvmonitor,monitor,tv 57 | 56:wall,walls 58 | 57:water 59 | 58:window,windows 60 | 59:wood piece 61 | -------------------------------------------------------------------------------- /odise/data/datasets/openseg_labels/pascal_voc_21.txt: -------------------------------------------------------------------------------- 1 | 0:background,bag,bed,bench,book,building,cabinet,ceiling,cloth,computer,cup,door,fence,floor,flower,food,grass,ground,keyboard,light,mountain,mouse,curtain,platform,sign,plate,road,rock,shelves,sidewalk,sky,snow,bedclothes,track,tree,truck,wall,water,window,wood 2 | 1:aeroplane 3 | 2:bicycle 4 | 3:bird 5 | 4:boat 6 | 5:bottle 7 | 6:bus 8 | 7:car 9 | 8:cat 10 | 9:chair 11 | 10:cow 12 | 11:diningtable 13 | 12:dog 14 | 13:horse 15 | 14:motorbike 16 | 15:person 17 | 16:pottedplant 18 | 17:sheep 19 | 18:sofa 20 | 19:train 21 | 20:tvmonitor 22 | -------------------------------------------------------------------------------- /odise/data/datasets/openseg_labels/pascal_voc_21_with_prompt_eng.txt: -------------------------------------------------------------------------------- 1 | 0:background,crops,bush,shrub,tiles,pavement,rug,carpet,box,boxes,speaker,storage,painting,board,panel,poster,clock,cage,drinking glass,park,plaything,toy,fireplace,bag,bag,bed,bench,book,books,building,buildings,cabinet,drawer,ceiling,computer,computer case,cup,cups,door,fence,floor,flower,grass,lawn,turf,ground,soil,dirt,tiles,keyboard,lamp,mountain,hills,mouse,curtain,platform,sign,street,rock,stone,shelf,sidewalk,sky,clouds,snow,track,train track,tree,trees,wall,water,window,wood,woods 2 | 1:aeroplane,airplane,aeroplanes,airplanes 3 | 2:bicycle,bicycles,bike,bikes 4 | 3:bird,birds 5 | 4:boat,boats 6 | 5:bottle,bottles,water bottle 7 | 6:bus,buses 8 | 7:car,cars 9 | 8:cat,cats,kitties,kitty 10 | 9:chair,chairs 11 | 10:cow,cows,calf 12 | 11:diningtable,dining table,diningtables,dining tables,plate,plates 13 | 12:dog,dogs,puppy,puppies 14 | 13:horse,horses,foal 15 | 14:motorbike,motorcycle,motorbikes,motorcycles 16 | 15:person,child,girl,boy,woman,man,people,childeren,girls,boys,women,men,lady,guy,ladies,guys,clothes 17 | 16:pottedplant,pottedplants,plant pot,plant pots,planter,planters 18 | 17:sheep 19 | 18:sofa,sofas 20 | 19:train,trains,locomotive,locomotives,freight train 21 | 20:tvmonitor,monitor,tv 22 | -------------------------------------------------------------------------------- /odise/data/datasets/register_coco_caption.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | import os 12 | from detectron2.data import MetadataCatalog 13 | from mask2former.data.datasets.register_coco_panoptic_annos_semseg import ( 14 | get_metadata, 15 | register_coco_panoptic_annos_sem_seg, 16 | ) 17 | 18 | _PREDEFINED_SPLITS_COCO_PANOPTIC_CAPTION = { 19 | "coco_2017_train_panoptic_caption": ( 20 | # This is the original panoptic annotation directory 21 | "coco/panoptic_train2017", 22 | "coco/annotations/panoptic_caption_train2017.json", 23 | # This directory contains semantic annotations that are 24 | # converted from panoptic annotations. 25 | # It is used by PanopticFPN. 26 | # You can use the script at detectron2/datasets/prepare_panoptic_fpn.py 27 | # to create these directories. 28 | "coco/panoptic_semseg_train2017", 29 | ), 30 | "coco_2017_val_panoptic_caption": ( 31 | "coco/panoptic_val2017", 32 | "coco/annotations/panoptic_caption_val2017.json", 33 | "coco/panoptic_semseg_val2017", 34 | ), 35 | "coco_2017_val_100_panoptic_caption": ( 36 | "coco/panoptic_val2017_100", 37 | "coco/annotations/panoptic_caption_val2017_100.json", 38 | "coco/panoptic_semseg_val2017_100", 39 | ), 40 | } 41 | 42 | 43 | # NOTE: the name is "coco_2017_train_panoptic_caption_with_sem_seg" and "coco_2017_val_panoptic_caption_with_sem_seg" # noqa 44 | def register_all_coco_panoptic_annos_sem_seg_caption(root): 45 | for ( 46 | prefix, 47 | (panoptic_root, panoptic_json, semantic_root), 48 | ) in _PREDEFINED_SPLITS_COCO_PANOPTIC_CAPTION.items(): 49 | if prefix.endswith("_panoptic_caption"): 50 | prefix_instances = prefix[: -len("_panoptic_caption")] 51 | else: 52 | raise ValueError("Unknown prefix: {}".format(prefix)) 53 | instances_meta = MetadataCatalog.get(prefix_instances) 54 | image_root, instances_json = instances_meta.image_root, instances_meta.json_file 55 | 56 | register_coco_panoptic_annos_sem_seg( 57 | prefix, 58 | get_metadata(), 59 | image_root, 60 | os.path.join(root, panoptic_root), 61 | os.path.join(root, panoptic_json), 62 | os.path.join(root, semantic_root), 63 | instances_json, 64 | ) 65 | 66 | 67 | register_all_coco_panoptic_annos_sem_seg_caption(os.getenv("DETECTRON2_DATASETS", "datasets")) 68 | -------------------------------------------------------------------------------- /odise/engine/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .train_loop import SimpleTrainer, AMPTrainer 12 | 13 | __all__ = ["SimpleTrainer", "AMPTrainer"] 14 | -------------------------------------------------------------------------------- /odise/engine/hooks.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
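Because `register_coco_caption.py` above calls `register_all_coco_panoptic_annos_sem_seg_caption` at import time (rooted at `$DETECTRON2_DATASETS`, default `datasets/`), the caption-augmented splits become queryable through detectron2's catalogs under the `*_with_sem_seg` names mentioned in its NOTE. A small sketch of looking one up — the printed metadata fields are illustrative, and loading the dicts requires the COCO panoptic/caption files to actually exist on disk:

```python
from detectron2.data import DatasetCatalog, MetadataCatalog

import odise.data  # noqa: F401  -- importing triggers the dataset registration

name = "coco_2017_val_panoptic_caption_with_sem_seg"
meta = MetadataCatalog.get(name)
print(meta.panoptic_root, meta.panoptic_json)  # assumed fields set by the register helper

dataset_dicts = DatasetCatalog.get(name)  # needs the files under $DETECTRON2_DATASETS
print(len(dataset_dicts), sorted(dataset_dicts[0].keys()))
```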
3 | # To view a copy of this license, visit 4 | # https://github.com/facebookresearch/detectron2/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | import inspect 18 | import detectron2.utils.comm as comm 19 | from detectron2.engine import EvalHook as _EvalHook 20 | from detectron2.evaluation.testing import flatten_results_dict 21 | 22 | 23 | class EvalHook(_EvalHook): 24 | def __init__(self, eval_period, eval_function): 25 | super().__init__(eval_period, eval_function) 26 | func_args = inspect.getfullargspec(eval_function).args 27 | assert {"final_iter", "next_iter"}.issubset(set(func_args)), ( 28 | "Eval function must have both 'final_iter' and 'next_iter' as arguments. " 29 | f"Got {func_args} instead." 30 | ) 31 | 32 | def _do_eval(self, final_iter=False, next_iter=0): 33 | results = self._func(final_iter=final_iter, next_iter=next_iter) 34 | 35 | if results: 36 | assert isinstance( 37 | results, dict 38 | ), "Eval function must return a dict. Got {} instead.".format(results) 39 | 40 | flattened_results = flatten_results_dict(results) 41 | for k, v in flattened_results.items(): 42 | try: 43 | v = float(v) 44 | except Exception as e: 45 | raise ValueError( 46 | "[EvalHook] eval_function should return a nested dict of float. " 47 | "Got '{}: {}' instead.".format(k, v) 48 | ) from e 49 | self.trainer.storage.put_scalars(**flattened_results, smoothing_hint=False) 50 | 51 | # Evaluation may take different amounts of time among workers. 52 | # A barrier makes them start the next iteration together. 53 | comm.synchronize() 54 | 55 | def after_step(self): 56 | next_iter = self.trainer.iter + 1 57 | if self._period > 0 and next_iter % self._period == 0: 58 | # do the last eval in after_train 59 | if next_iter != self.trainer.max_iter: 60 | self._do_eval(next_iter=next_iter) 61 | 62 | def after_train(self): 63 | # This condition is to prevent the eval from running after a failed training 64 | if self.trainer.iter + 1 >= self.trainer.max_iter: 65 | self._do_eval(final_iter=True) 66 | # func is likely a closure that holds a reference to the trainer, 67 | # so we delete it to avoid a circular reference in the end 68 | del self._func 69 | -------------------------------------------------------------------------------- /odise/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License.
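The `EvalHook` subclass above requires its `eval_function` to accept both `final_iter` and `next_iter` keyword arguments (that is what the `issubset` assertion enforces), and the returned, possibly nested, dict must contain float-convertible values. A minimal compatible callback, with a placeholder evaluation body:

```python
from odise.engine.hooks import EvalHook


def eval_function(final_iter: bool = False, next_iter: int = 0):
    # A real callback would run inference here (e.g. via inference_on_dataset)
    # and return its metrics; the values below are placeholders.
    tag = "final" if final_iter else f"iter_{next_iter}"
    print(f"running evaluation ({tag})")
    return {"panoptic_seg": {"PQ": 0.0, "SQ": 0.0, "RQ": 0.0}}


hook = EvalHook(eval_period=5000, eval_function=eval_function)
# The hook is then passed to the trainer alongside the other hooks.
```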
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .evaluator import inference_on_dataset 12 | from .d2_evaluator import ( 13 | COCOPanopticEvaluator, 14 | InstanceSegEvaluator, 15 | SemSegEvaluator, 16 | COCOEvaluator, 17 | ) 18 | 19 | __all__ = [ 20 | "inference_on_dataset", 21 | "COCOPanopticEvaluator", 22 | "InstanceSegEvaluator", 23 | "SemSegEvaluator", 24 | "COCOEvaluator", 25 | ] 26 | -------------------------------------------------------------------------------- /odise/model_zoo/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # To view a copy of this license, visit 4 | # https://github.com/facebookresearch/detectron2/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | """ 18 | Model Zoo API for ODISE: a collection of functions to create common model architectures 19 | listed in `MODEL_ZOO.md `_, 20 | and optionally load their pre-trained weights. 21 | """ 22 | 23 | from .model_zoo import get, get_config_file, get_checkpoint_url, get_config 24 | 25 | __all__ = ["get_checkpoint_url", "get", "get_config_file", "get_config"] 26 | -------------------------------------------------------------------------------- /odise/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .backbone import FeatureExtractorBackbone 12 | 13 | __all__ = ["FeatureExtractorBackbone"] 14 | -------------------------------------------------------------------------------- /odise/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
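The model-zoo package above exposes `get_config_file`, `get_config`, `get_checkpoint_url` and `get`. A hedged usage sketch — the config key and the `trained=True` keyword follow the detectron2 `model_zoo` convention and are assumptions here, so check `odise/model_zoo/model_zoo.py` for the exact keys and signatures:

```python
import odise.model_zoo as model_zoo

config_path = "Panoptic/odise_label_coco_50e.py"  # illustrative config key

config_file = model_zoo.get_config_file(config_path)  # path to the config on disk
cfg = model_zoo.get_config(config_path)               # loaded (lazy) config object
url = model_zoo.get_checkpoint_url(config_path)       # pretrained weights URL
model = model_zoo.get(config_path, trained=True)      # assumed keyword, see note above
print(config_file, url, type(model).__name__)
```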
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .feature_extractor import FeatureExtractorBackbone 12 | 13 | __all__ = ["FeatureExtractorBackbone"] 14 | -------------------------------------------------------------------------------- /odise/modeling/diffusion/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .diffusion_builder import create_gaussian_diffusion 12 | from .gaussian_diffusion import GaussianDiffusion 13 | 14 | __all__ = ["create_gaussian_diffusion", "GaussianDiffusion"] 15 | -------------------------------------------------------------------------------- /odise/modeling/diffusion/diffusion_builder.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2021 OpenAI 3 | # To view a copy of this license, visit 4 | # https://github.com/openai/glide-text2im/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | from . 
import gaussian_diffusion as gd 18 | from .respace import SpacedDiffusion, space_timesteps 19 | 20 | 21 | def create_gaussian_diffusion( 22 | *, 23 | steps=1000, 24 | learn_sigma=False, 25 | sigma_small=False, 26 | noise_schedule="linear", 27 | use_kl=False, 28 | predict_xstart=False, 29 | rescale_timesteps=False, 30 | rescale_learned_sigmas=False, 31 | timestep_respacing="", 32 | ): 33 | betas = gd.get_named_beta_schedule(noise_schedule, steps) 34 | if use_kl: 35 | loss_type = gd.LossType.RESCALED_KL 36 | elif rescale_learned_sigmas: 37 | loss_type = gd.LossType.RESCALED_MSE 38 | else: 39 | loss_type = gd.LossType.MSE 40 | if not timestep_respacing: 41 | timestep_respacing = [steps] 42 | return SpacedDiffusion( 43 | use_timesteps=space_timesteps(steps, timestep_respacing), 44 | betas=betas, 45 | model_mean_type=( 46 | gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X 47 | ), 48 | model_var_type=( 49 | (gd.ModelVarType.FIXED_LARGE if not sigma_small else gd.ModelVarType.FIXED_SMALL) 50 | if not learn_sigma 51 | else gd.ModelVarType.LEARNED_RANGE 52 | ), 53 | loss_type=loss_type, 54 | rescale_timesteps=rescale_timesteps, 55 | ) 56 | -------------------------------------------------------------------------------- /odise/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | from .odise import CategoryODISE, CaptionODISE 11 | 12 | __all__ = [ 13 | "CategoryODISE", 14 | "CaptionODISE", 15 | ] 16 | -------------------------------------------------------------------------------- /odise/modeling/preprocess.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
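`create_gaussian_diffusion` above assembles a `SpacedDiffusion` from the GLIDE-style options. A short sketch of calling it with the defaults spelled out (the chosen values are merely illustrative):

```python
from odise.modeling.diffusion import create_gaussian_diffusion

# A full 1000-step, linear-beta diffusion that predicts epsilon with fixed variance.
diffusion = create_gaussian_diffusion(
    steps=1000,
    learn_sigma=False,
    noise_schedule="linear",
    timestep_respacing="",  # empty string keeps all 1000 timesteps
)
print(type(diffusion).__name__)  # SpacedDiffusion
```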
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | import collections.abc 12 | import torch 13 | 14 | 15 | def batched_input_to_device(batched_inputs, device, exclude=()): 16 | """Recursively move tensors in ``batched_inputs`` to ``device``.""" 17 | if isinstance(exclude, str): 18 | exclude = [exclude] 19 | 20 | if isinstance(batched_inputs, torch.Tensor): 21 | batch = batched_inputs.to(device, non_blocking=True) 22 | return batch 23 | elif isinstance(batched_inputs, collections.abc.Mapping): 24 | # the mapping is modified in place; keys listed in `exclude` are left untouched 25 | for k in batched_inputs: 26 | if k not in exclude: 27 | batched_inputs[k] = batched_input_to_device(batched_inputs[k], device) 28 | return batched_inputs 29 | 30 | elif isinstance(batched_inputs, collections.abc.Sequence) and not isinstance( 31 | batched_inputs, str 32 | ): 33 | return [batched_input_to_device(d, device) for d in batched_inputs] 34 | elif isinstance(batched_inputs, str): 35 | return batched_inputs 36 | else: 37 | raise TypeError(f"Unsupported type {type(batched_inputs)}") 38 | -------------------------------------------------------------------------------- /odise/modeling/wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .pano_wrapper import OpenPanopticInference 12 | 13 | __all__ = ["OpenPanopticInference"] 14 | -------------------------------------------------------------------------------- /odise/modeling/wrapper/pano_wrapper.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License.
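A quick sketch of what `batched_input_to_device` above does with a detectron2-style batch: nested containers are walked recursively, strings pass through unchanged, and keys listed in `exclude` are skipped at the level of the mapping the call receives (the recursive calls do not forward `exclude`). The batch contents are made up for illustration:

```python
import torch

from odise.modeling.preprocess import batched_input_to_device

batch = {
    "images": [torch.rand(3, 4, 4), torch.rand(3, 4, 4)],
    "captions": ["a cat on a couch", "a dog"],  # excluded: left untouched
    "meta": {"padding_mask": torch.zeros(4, 4)},
}

device = "cuda" if torch.cuda.is_available() else "cpu"
moved = batched_input_to_device(batch, device, exclude="captions")

print(moved["images"][0].device, moved["meta"]["padding_mask"].device)
```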
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from collections import OrderedDict 12 | import torch.nn as nn 13 | 14 | 15 | class OpenPanopticInference(nn.Module): 16 | def __init__( 17 | self, 18 | model, 19 | labels, 20 | metadata=None, 21 | semantic_on=True, 22 | instance_on=True, 23 | panoptic_on=True, 24 | test_topk_per_image=100, 25 | ): 26 | super().__init__() 27 | self.model = model 28 | self.labels = labels 29 | self.metadata = metadata 30 | 31 | self.semantic_on = semantic_on 32 | self.instance_on = instance_on 33 | self.panoptic_on = panoptic_on 34 | self.test_topk_per_image = test_topk_per_image 35 | 36 | self.open_state_dict = OrderedDict() 37 | 38 | for k in self.model.open_state_dict(): 39 | if k.endswith("test_labels"): 40 | self.open_state_dict[k] = self.labels 41 | elif k.endswith("metadata"): 42 | self.open_state_dict[k] = self.metadata 43 | elif k.endswith("num_classes"): 44 | self.open_state_dict[k] = self.num_classes 45 | elif k.endswith("semantic_on"): 46 | self.open_state_dict[k] = self.semantic_on 47 | elif k.endswith("instance_on"): 48 | self.open_state_dict[k] = self.instance_on 49 | elif k.endswith("panoptic_on"): 50 | self.open_state_dict[k] = self.panoptic_on 51 | elif k.endswith("test_topk_per_image"): 52 | self.open_state_dict[k] = self.test_topk_per_image 53 | 54 | @property 55 | def num_classes(self): 56 | return len(self.labels) 57 | 58 | def forward(self, batched_inputs): 59 | assert not self.training 60 | 61 | _open_state_dict = self.model.open_state_dict() 62 | self.model.load_open_state_dict(self.open_state_dict) 63 | 64 | results = self.model(batched_inputs) 65 | 66 | self.model.load_open_state_dict(_open_state_dict) 67 | 68 | return results 69 | -------------------------------------------------------------------------------- /odise/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/odise/utils/__init__.py -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length=100 3 | multi_line_output=3 4 | include_trailing_comma=True 5 | known_standard_library=numpy,setuptools,mock 6 | skip=./datasets,docs,local_data,third_party 7 | skip_glob=*/__init__.py,**/configs/**,tests/config/**,vision/modeling/mask2former/**,output/** 8 | known_myself=odise 9 | known_third_party=fvcore,matplotlib,cv2,torch,torchvision,PIL,pycocotools,yacs,termcolor,cityscapesscripts,tabulate,tqdm,scipy,lvis,psutil,pkg_resources,caffe2,onnx,panopticapi,black,isort,av,iopath,omegaconf,hydra,yaml,pydoc,submitit,cloudpickle 10 | no_lines_before=STDLIB,THIRDPARTY 11 | sections=FUTURE,STDLIB,THIRDPARTY,myself,FIRSTPARTY,LOCALFOLDER 12 | default_section=FIRSTPARTY 13 | 14 | [mypy] 15 | python_version=3.6 16 | ignore_missing_imports = True 17 | warn_unused_configs = True 18 | disallow_untyped_defs = True 19 | check_untyped_defs = True 20 | warn_unused_ignores = True 21 | warn_redundant_casts = True 22 | show_column_numbers = True 23 | follow_imports = silent 24 | allow_redefinition = True 25 | ; Require all functions to be annotated 26 | disallow_incomplete_defs = True 27 | 
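`OpenPanopticInference` in `pano_wrapper.py` above temporarily swaps an open-vocabulary label set (plus matching metadata and test-time flags) into a trained model and restores the original open state after the forward pass. A hedged sketch of wrapping a model for inference — it assumes `model` is a trained ODISE meta-architecture exposing `open_state_dict()` / `load_open_state_dict()`, and the labels/metadata values are placeholders:

```python
from detectron2.data import MetadataCatalog

from odise.modeling.wrapper import OpenPanopticInference

# Placeholder open-vocabulary label set: one list of synonyms per category.
labels = [["person"], ["dog", "puppy"], ["grass", "lawn"]]

inference_model = OpenPanopticInference(
    model=model,  # assumed: a trained ODISE model, e.g. built from a model-zoo config
    labels=labels,
    metadata=MetadataCatalog.get("odise_demo_metadata"),  # placeholder metadata entry
    semantic_on=False,
    instance_on=False,
    panoptic_on=True,
)
inference_model.eval()
# outputs = inference_model(batched_inputs)  # standard detectron2 list-of-dicts input
```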
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # ------------------------------------------------------------------------------ 4 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 | # 6 | # This work is made available under the Nvidia Source Code License. 7 | # To view a copy of this license, visit 8 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 9 | # 10 | # Written by Jiarui Xu 11 | # ------------------------------------------------------------------------------ 12 | 13 | import glob 14 | import os 15 | import shutil 16 | from os import path 17 | from setuptools import find_packages, setup 18 | from typing import List 19 | import torch 20 | 21 | torch_ver = [int(x) for x in torch.__version__.split(".")[:2]] 22 | assert torch_ver >= [1, 8], "Requires PyTorch >= 1.8" 23 | 24 | 25 | def get_version(): 26 | init_py_path = path.join(path.abspath(path.dirname(__file__)), "odise", "__init__.py") 27 | init_py = open(init_py_path, "r").readlines() 28 | version_line = [l.strip() for l in init_py if l.startswith("__version__")][0] 29 | version = version_line.split("=")[-1].strip().strip("'\"") 30 | 31 | return version 32 | 33 | 34 | def get_model_zoo_configs() -> List[str]: 35 | """ 36 | Return a list of configs to include in package for model zoo. Copy over these configs inside 37 | odise/model_zoo. 38 | """ 39 | 40 | # Use absolute paths while symlinking. 41 | source_configs_dir = path.join(path.dirname(path.realpath(__file__)), "configs") 42 | destination = path.join(path.dirname(path.realpath(__file__)), "odise", "model_zoo", "configs") 43 | # Symlink the config directory inside package to have a cleaner pip install. 44 | 45 | # Remove stale symlink/directory from a previous build. 46 | if path.exists(source_configs_dir): 47 | if path.islink(destination): 48 | os.unlink(destination) 49 | elif path.isdir(destination): 50 | shutil.rmtree(destination) 51 | 52 | if not path.exists(destination): 53 | try: 54 | os.symlink(source_configs_dir, destination) 55 | except OSError: 56 | # Fall back to copying if symlink fails: ex. on Windows. 57 | shutil.copytree(source_configs_dir, destination) 58 | 59 | config_paths = glob.glob("configs/**/*.yaml", recursive=True) + glob.glob( 60 | "configs/**/*.py", recursive=True 61 | ) 62 | return config_paths 63 | 64 | 65 | setup( 66 | name="odise", 67 | version=get_version(), 68 | author="Jiarui Xu", 69 | url="https://github.com/NVlabs/ODISE", 70 | description="Open-vocabulary DIffusion-based Panoptic Segmentation", 71 | packages=find_packages(exclude=("configs", "tests*")), 72 | package_data={"odise.model_zoo": get_model_zoo_configs()}, 73 | python_requires=">=3.8", 74 | install_requires=[ 75 | "timm==0.6.11", # freeze timm version for stabliity 76 | "opencv-python==4.6.0.66", 77 | "diffdist==0.1", 78 | "nltk>=3.6.2", 79 | "einops>=0.3.0", 80 | "wandb>=0.12.11", 81 | # "transformers==4.20.1", # freeze transformers version for stabliity 82 | # there is BC breaking in omegaconf 2.2.1 83 | # see: https://github.com/omry/omegaconf/issues/939 84 | "omegaconf==2.1.1", 85 | "open-clip-torch==2.0.2", 86 | f"mask2former @ file://localhost/{os.getcwd()}/third_party/Mask2Former/", 87 | "stable-diffusion-sdkit==2.1.3", 88 | ], 89 | extras_require={ 90 | # dev dependencies. 
Install them by `pip install 'odise[dev]'` 91 | "dev": [ 92 | "flake8==3.8.1", 93 | "isort==4.3.21", 94 | "flake8-bugbear", 95 | "flake8-comprehensions", 96 | "click==8.0.4", 97 | "importlib-metadata==4.11.3", 98 | ], 99 | }, 100 | include_package_data=True, 101 | ) 102 | -------------------------------------------------------------------------------- /third_party/Mask2Former/.gitignore: -------------------------------------------------------------------------------- 1 | # output dir 2 | output 3 | instant_test_output 4 | inference_test_output 5 | 6 | 7 | *.png 8 | *.json 9 | *.diff 10 | *.jpg 11 | !/projects/DensePose/doc/images/*.jpg 12 | 13 | # compilation and distribution 14 | __pycache__ 15 | _ext 16 | *.pyc 17 | *.pyd 18 | *.so 19 | *.dll 20 | *.egg-info/ 21 | build/ 22 | dist/ 23 | wheels/ 24 | 25 | # pytorch/python/numpy formats 26 | *.pth 27 | *.pkl 28 | *.npy 29 | *.ts 30 | model_ts*.txt 31 | 32 | # ipython/jupyter notebooks 33 | *.ipynb 34 | **/.ipynb_checkpoints/ 35 | 36 | # Editor temporaries 37 | *.swn 38 | *.swo 39 | *.swp 40 | *~ 41 | 42 | # editor settings 43 | .idea 44 | .vscode 45 | _darcs 46 | 47 | # project dirs 48 | /detectron2/model_zoo/configs 49 | /datasets/* 50 | !/datasets/*.* 51 | /projects/*/datasets 52 | /models 53 | /snippet -------------------------------------------------------------------------------- /third_party/Mask2Former/ADVANCED_USAGE.md: -------------------------------------------------------------------------------- 1 | ## Advanced Usage of Mask2Former 2 | 3 | This document provides a brief intro of the advanced usage of Mask2Former for research purposes. 4 | 5 | Mask2Former is highly modularized; it consists of three components: a backbone, a pixel decoder and a Transformer decoder. 6 | You can easily replace each of these three components with your own implementation. 7 | 8 | ### Test Mask2Former with your own backbone 9 | 10 | 1. Define and register your backbone under `mask2former/modeling/backbone`. You can follow the Swin Transformer as an example. 11 | 2. Change the config file accordingly. 12 | 13 | ### Test Mask2Former with your own pixel decoder 14 | 15 | 1. Define and register your pixel decoder under `mask2former/modeling/pixel_decoder`. 16 | 2. Change the config file accordingly. 17 | 18 | Note that your pixel decoder must have a `self.forward_features(features)` method that returns three values (a minimal skeleton is sketched below): 19 | 1. `mask_features`, which are the per-pixel embeddings at 1/4 the resolution of the original image. This is used to produce binary masks. 20 | 2. `None`, you can simply return `None` for the second value. 21 | 3. `multi_scale_features`, which are the multi-scale inputs to the Transformer decoder. This must be a list with length 3. 22 | We use resolution 1/32, 1/16, and 1/8 but you can use arbitrary resolutions here. 23 | 24 | Example config to use a Transformer-encoder enhanced FPN instead of MSDeformAttn: 25 | ``` 26 | MODEL: 27 | SEM_SEG_HEAD: 28 | # pixel decoder 29 | PIXEL_DECODER_NAME: "TransformerEncoderPixelDecoder" 30 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 31 | COMMON_STRIDE: 4 32 | TRANSFORMER_ENC_LAYERS: 6 33 | ``` 34 | 35 | ### Build a new Transformer decoder. 36 | 37 | Transformer decoders are defined under `mask2former/modeling/transformer_decoder`.
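To make the `forward_features` contract above concrete, here is a bare-bones pixel decoder skeleton. It is a sketch, not a drop-in component: registration and config plumbing are omitted (see the existing decoders under `mask2former/modeling/pixel_decoder` for that), and the 1x1 convolutions stand in for real fusion logic:

```python
import torch
import torch.nn as nn


class ToyPixelDecoder(nn.Module):
    """Minimal pixel decoder honoring the Mask2Former interface described above."""

    def __init__(self, in_channels, conv_dim=256, mask_dim=256):
        super().__init__()
        # in_channels: e.g. {"res2": 256, "res3": 512, "res4": 1024, "res5": 2048}
        self.lateral = nn.ModuleDict(
            {name: nn.Conv2d(ch, conv_dim, kernel_size=1) for name, ch in in_channels.items()}
        )
        self.mask_proj = nn.Conv2d(conv_dim, mask_dim, kernel_size=1)

    def forward_features(self, features):
        # 1) per-pixel embeddings at 1/4 resolution ("res2"), used for binary masks
        mask_features = self.mask_proj(self.lateral["res2"](features["res2"]))
        # 2) the second return value is unused, so it is simply None
        # 3) exactly three multi-scale maps for the Transformer decoder (1/32, 1/16, 1/8 here)
        multi_scale_features = [
            self.lateral[name](features[name]) for name in ("res5", "res4", "res3")
        ]
        return mask_features, None, multi_scale_features


decoder = ToyPixelDecoder({"res2": 256, "res3": 512, "res4": 1024, "res5": 2048})
feats = {
    "res2": torch.rand(1, 256, 64, 64),
    "res3": torch.rand(1, 512, 32, 32),
    "res4": torch.rand(1, 1024, 16, 16),
    "res5": torch.rand(1, 2048, 8, 8),
}
mask_features, _, multi_scale_features = decoder.forward_features(feats)
print(mask_features.shape, [f.shape for f in multi_scale_features])
```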
38 | -------------------------------------------------------------------------------- /third_party/Mask2Former/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to. 4 | Please read the [full text](https://code.fb.com/codeofconduct/) 5 | so that you can understand what actions will and will not be tolerated. 6 | -------------------------------------------------------------------------------- /third_party/Mask2Former/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to maskformer2 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Our Development Process 6 | Minor changes and improvements will be released on an ongoing basis. Larger changes (e.g., changesets implementing a new paper) will be released on a more periodic basis. 7 | 8 | ## Pull Requests 9 | We actively welcome your pull requests. 10 | 11 | 1. Fork the repo and create your branch from `main`. 12 | 2. If you've added code that should be tested, add tests. 13 | 3. If you've changed APIs, update the documentation. 14 | 4. Ensure the test suite passes. 15 | 5. Make sure your code lints. 16 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 17 | 18 | ## Contributor License Agreement ("CLA") 19 | In order to accept your pull request, we need you to submit a CLA. You only need 20 | to do this once to work on any of Facebook's open source projects. 21 | 22 | Complete your CLA here: 23 | 24 | ## Issues 25 | We use GitHub issues to track public bugs. Please ensure your description is 26 | clear and has sufficient instructions to be able to reproduce the issue. 27 | 28 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 29 | disclosure of security bugs. In those cases, please go through the process 30 | outlined on that page and do not file a public issue. 31 | 32 | ## Coding Style 33 | * 4 spaces for indentation rather than tabs 34 | * 80 character line length 35 | * PEP8 formatting following [Black](https://black.readthedocs.io/en/stable/) 36 | 37 | ## License 38 | By contributing to MaskFormer, you agree that your contributions will be licensed 39 | under the LICENSE file in the root directory of this source tree. 40 | -------------------------------------------------------------------------------- /third_party/Mask2Former/GETTING_STARTED.md: -------------------------------------------------------------------------------- 1 | ## Getting Started with Mask2Former 2 | 3 | This document provides a brief intro of the usage of Mask2Former. 4 | 5 | Please see [Getting Started with Detectron2](https://github.com/facebookresearch/detectron2/blob/master/GETTING_STARTED.md) for full usage. 6 | 7 | 8 | ### Inference Demo with Pre-trained Models 9 | 10 | 1. Pick a model and its config file from 11 | [model zoo](MODEL_ZOO.md), 12 | for example, `configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml`. 13 | 2. We provide `demo.py` that is able to demo builtin configs. 
Run it with: 14 | ``` 15 | cd demo/ 16 | python demo.py --config-file ../configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml \ 17 | --input input1.jpg input2.jpg \ 18 | [--other-options] 19 | --opts MODEL.WEIGHTS /path/to/checkpoint_file 20 | ``` 21 | The configs are made for training, therefore we need to specify `MODEL.WEIGHTS` to a model from model zoo for evaluation. 22 | This command will run the inference and show visualizations in an OpenCV window. 23 | 24 | For details of the command line arguments, see `demo.py -h` or look at its source code 25 | to understand its behavior. Some common arguments are: 26 | * To run __on your webcam__, replace `--input files` with `--webcam`. 27 | * To run __on a video__, replace `--input files` with `--video-input video.mp4`. 28 | * To run __on cpu__, add `MODEL.DEVICE cpu` after `--opts`. 29 | * To save outputs to a directory (for images) or a file (for webcam or video), use `--output`. 30 | 31 | 32 | ### Training & Evaluation in Command Line 33 | 34 | We provide a script `train_net.py`, that is made to train all the configs provided in Mask2Former. 35 | 36 | To train a model with "train_net.py", first 37 | setup the corresponding datasets following 38 | [datasets/README.md](./datasets/README.md), 39 | then run: 40 | ``` 41 | python train_net.py --num-gpus 8 \ 42 | --config-file configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml 43 | ``` 44 | 45 | The configs are made for 8-GPU training. 46 | Since we use ADAMW optimizer, it is not clear how to scale learning rate with batch size. 47 | To train on 1 GPU, you need to figure out learning rate and batch size by yourself: 48 | ``` 49 | python train_net.py \ 50 | --config-file configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml \ 51 | --num-gpus 1 SOLVER.IMS_PER_BATCH SET_TO_SOME_REASONABLE_VALUE SOLVER.BASE_LR SET_TO_SOME_REASONABLE_VALUE 52 | ``` 53 | 54 | To evaluate a model's performance, use 55 | ``` 56 | python train_net.py \ 57 | --config-file configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml \ 58 | --eval-only MODEL.WEIGHTS /path/to/checkpoint_file 59 | ``` 60 | For more options, see `python train_net.py -h`. 61 | 62 | 63 | ### Video instance segmentation 64 | Please use `demo_video/demo.py` for video instance segmentation demo and `train_net_video.py` to train 65 | and evaluate video instance segmentation models. 66 | -------------------------------------------------------------------------------- /third_party/Mask2Former/INSTALL.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | ### Requirements 4 | - Linux or macOS with Python ≥ 3.6 5 | - PyTorch ≥ 1.9 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation. 6 | Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note, please check 7 | PyTorch version matches that is required by Detectron2. 8 | - Detectron2: follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html). 9 | - OpenCV is optional but needed by demo and visualization 10 | - `pip install -r requirements.txt` 11 | 12 | ### CUDA kernel for MSDeformAttn 13 | After preparing the required environment, run the following command to compile CUDA kernel for MSDeformAttn: 14 | 15 | `CUDA_HOME` must be defined and points to the directory of the installed CUDA toolkit. 
16 | 17 | ```bash 18 | cd mask2former/modeling/pixel_decoder/ops 19 | sh make.sh 20 | ``` 21 | 22 | #### Building on another system 23 | To build on a system that does not have a GPU device but provide the drivers: 24 | ```bash 25 | TORCH_CUDA_ARCH_LIST='8.0' FORCE_CUDA=1 python setup.py build install 26 | ``` 27 | 28 | ### Example conda environment setup 29 | ```bash 30 | conda create --name mask2former python=3.8 -y 31 | conda activate mask2former 32 | conda install pytorch==1.9.0 torchvision==0.10.0 cudatoolkit=11.1 -c pytorch -c nvidia 33 | pip install -U opencv-python 34 | 35 | # under your working directory 36 | git clone git@github.com:facebookresearch/detectron2.git 37 | cd detectron2 38 | pip install -e . 39 | pip install git+https://github.com/cocodataset/panopticapi.git 40 | pip install git+https://github.com/mcordts/cityscapesScripts.git 41 | 42 | cd .. 43 | git clone git@github.com:facebookresearch/Mask2Former.git 44 | cd Mask2Former 45 | pip install -r requirements.txt 46 | cd mask2former/modeling/pixel_decoder/ops 47 | sh make.sh 48 | ``` 49 | -------------------------------------------------------------------------------- /third_party/Mask2Former/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2022 Meta, Inc. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /third_party/Mask2Former/README.md: -------------------------------------------------------------------------------- 1 | # Mask2Former: Masked-attention Mask Transformer for Universal Image Segmentation (CVPR 2022) 2 | 3 | [Bowen Cheng](https://bowenc0221.github.io/), [Ishan Misra](https://imisra.github.io/), [Alexander G. Schwing](https://alexander-schwing.de/), [Alexander Kirillov](https://alexander-kirillov.github.io/), [Rohit Girdhar](https://rohitgirdhar.github.io/) 4 | 5 | [[`arXiv`](https://arxiv.org/abs/2112.01527)] [[`Project`](https://bowenc0221.github.io/mask2former)] [[`BibTeX`](#CitingMask2Former)] 6 | 7 |
8 | 9 |

10 | 11 | ### Features 12 | * A single architecture for panoptic, instance and semantic segmentation. 13 | * Support major segmentation datasets: ADE20K, Cityscapes, COCO, Mapillary Vistas. 14 | 15 | ## Updates 16 | * Add Google Colab demo. 17 | * Video instance segmentation is now supported! Please check our [tech report](https://arxiv.org/abs/2112.10764) for more details. 18 | 19 | ## Installation 20 | 21 | See [installation instructions](INSTALL.md). 22 | 23 | ## Getting Started 24 | 25 | See [Preparing Datasets for Mask2Former](datasets/README.md). 26 | 27 | See [Getting Started with Mask2Former](GETTING_STARTED.md). 28 | 29 | Run our demo using Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1uIWE5KbGFSjrxey2aRd5pWkKNY1_SaNq) 30 | 31 | Integrated into [Huggingface Spaces 🤗](https://huggingface.co/spaces) using [Gradio](https://github.com/gradio-app/gradio). Try out the Web Demo: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/akhaliq/Mask2Former) 32 | 33 | Replicate web demo and docker image is available here: [![Replicate](https://replicate.com/facebookresearch/mask2former/badge)](https://replicate.com/facebookresearch/mask2former) 34 | 35 | ## Advanced usage 36 | 37 | See [Advanced Usage of Mask2Former](ADVANCED_USAGE.md). 38 | 39 | ## Model Zoo and Baselines 40 | 41 | We provide a large set of baseline results and trained models available for download in the [Mask2Former Model Zoo](MODEL_ZOO.md). 42 | 43 | ## License 44 | 45 | Shield: [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 46 | 47 | The majority of Mask2Former is licensed under a [MIT License](LICENSE). 48 | 49 | 50 | However portions of the project are available under separate license terms: Swin-Transformer-Semantic-Segmentation is licensed under the [MIT license](https://github.com/SwinTransformer/Swin-Transformer-Semantic-Segmentation/blob/main/LICENSE), Deformable-DETR is licensed under the [Apache-2.0 License](https://github.com/fundamentalvision/Deformable-DETR/blob/main/LICENSE). 51 | 52 | ## Citing Mask2Former 53 | 54 | If you use Mask2Former in your research or wish to refer to the baseline results published in the [Model Zoo](MODEL_ZOO.md), please use the following BibTeX entry. 55 | 56 | ```BibTeX 57 | @inproceedings{cheng2021mask2former, 58 | title={Masked-attention Mask Transformer for Universal Image Segmentation}, 59 | author={Bowen Cheng and Ishan Misra and Alexander G. Schwing and Alexander Kirillov and Rohit Girdhar}, 60 | journal={CVPR}, 61 | year={2022} 62 | } 63 | ``` 64 | 65 | If you find the code useful, please also consider the following BibTeX entry. 66 | 67 | ```BibTeX 68 | @inproceedings{cheng2021maskformer, 69 | title={Per-Pixel Classification is Not All You Need for Semantic Segmentation}, 70 | author={Bowen Cheng and Alexander G. Schwing and Alexander Kirillov}, 71 | journal={NeurIPS}, 72 | year={2021} 73 | } 74 | ``` 75 | 76 | ## Acknowledgement 77 | 78 | Code is largely based on MaskFormer (https://github.com/facebookresearch/MaskFormer). 
79 | -------------------------------------------------------------------------------- /third_party/Mask2Former/cog.yaml: -------------------------------------------------------------------------------- 1 | build: 2 | gpu: true 3 | cuda: "10.1" 4 | python_version: "3.8" 5 | system_packages: 6 | - "libgl1-mesa-glx" 7 | - "libglib2.0-0" 8 | python_packages: 9 | - "ipython==7.30.1" 10 | - "numpy==1.21.4" 11 | - "torch==1.8.1" 12 | - "torchvision==0.9.1" 13 | - "opencv-python==4.5.5.62" 14 | - "Shapely==1.8.0" 15 | - "h5py==3.6.0" 16 | - "scipy==1.7.3" 17 | - "submitit==1.4.1" 18 | - "scikit-image==0.19.1" 19 | - "Cython==0.29.27" 20 | - "timm==0.4.12" 21 | run: 22 | - pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html 23 | - pip install git+https://github.com/cocodataset/panopticapi.git 24 | - pip install git+https://github.com/mcordts/cityscapesScripts.git 25 | - git clone https://github.com/facebookresearch/Mask2Former 26 | - TORCH_CUDA_ARCH_LIST='7.5' FORCE_CUDA=1 python Mask2Former/mask2former/modeling/pixel_decoder/ops/setup.py build install 27 | 28 | predict: "predict.py:Predictor" 29 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/instance-segmentation/Base-ADE20K-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_instance_train",) 18 | TEST: ("ade20k_instance_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 640 40 | MAX_SIZE_TRAIN: 2560 41 | MAX_SIZE_TEST: 2560 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (640, 640) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_instance" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 56 | MAX_SIZE: 4480 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/instance-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 100 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | 
MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/panoptic-segmentation/Base-ADE20K-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_panoptic_train",) 18 | TEST: ("ade20k_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 640 40 | MAX_SIZE_TRAIN: 2560 41 | MAX_SIZE_TEST: 2560 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (640, 640) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 
54 | ENABLED: False 55 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 56 | MAX_SIZE: 4480 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/panoptic-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/Base-ADE20K-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_sem_seg_train",) 18 | TEST: ("ade20k_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | 
BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 512 40 | MAX_SIZE_TRAIN: 2048 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 512) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 512 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [256, 384, 512, 640, 768, 896] 56 | MAX_SIZE: 3584 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 
13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_small_bs16_160k.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/Base-Cityscapes-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_instance_seg_train",) 18 | TEST: ("cityscapes_fine_instance_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_instance" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | 
STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 8 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: False 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: 
"D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/Base-Cityscapes-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_panoptic_train",) 18 | TEST: ("cityscapes_fine_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | 
-------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | 
DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/Base-Cityscapes-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_sem_seg_train",) 18 | TEST: ("cityscapes_fine_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/maskformer2_R50_bs16_90k.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 
| -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/Base-COCO-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train",) 18 | TEST: ("coco_2017_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_instance_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/maskformer2_R101_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 80 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | 
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: False 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | SOLVER: 20 | STEPS: (655556, 710184) 21 | MAX_ITER: 737500 22 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | 
PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_tiny_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/Base-COCO-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train_panoptic",) 18 | TEST: ("coco_2017_val_panoptic_with_sem_seg",) # to evaluate instance and semantic performance as well 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_panoptic_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/maskformer2_R101_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 133 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | 
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: True 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: True 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | SOLVER: 20 | STEPS: (655556, 710184) 21 | MAX_ITER: 737500 22 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | 
EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_tiny_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/mapillary-vistas/panoptic-segmentation/Base-MapillaryVistas-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_vistas_panoptic_train",) 18 | TEST: ("mapillary_vistas_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 300000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 2048 40 | MAX_SIZE_TRAIN: 8192 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (1024, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 1024 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 0 53 | DATALOADER: 54 | FILTER_EMPTY_ANNOTATIONS: True 55 | NUM_WORKERS: 10 56 | VERSION: 2 57 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/mapillary-vistas/panoptic-segmentation/maskformer_R50_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-MapillaryVistas-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 65 7 | NUM_CLASSES: 65 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 
4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.0 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/mapillary-vistas/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/mapillary-vistas/semantic-segmentation/Base-MapillaryVistas-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_vistas_sem_seg_train",) 18 | TEST: ("mapillary_vistas_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 300000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 2048 40 | MAX_SIZE_TRAIN: 8192 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (1024, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 1024 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 0 53 | DATALOADER: 54 | FILTER_EMPTY_ANNOTATIONS: True 55 | NUM_WORKERS: 10 56 | VERSION: 2 57 | -------------------------------------------------------------------------------- 
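Several of the base configs above build `MIN_SIZE_TRAIN` with PyYAML's `!!python/object/apply:eval` tag, so the list of training scales is materialized when the YAML is parsed rather than written out by hand (detectron2's config loader falls back to an unsafe YAML load, which is what makes this tag legal). A small self-contained sketch of what the tag expands to, using the Mapillary Vistas base size of 2048 from the file just above:

```python
# Sketch: expanding the !!python/object/apply:eval construct used above.
# PyYAML's unsafe loader calls eval() on the quoted expression and stores the
# resulting list, i.e. 16 scales from 0.5x to 2.0x of the base size.
import yaml

snippet = (
    'MIN_SIZE_TRAIN: !!python/object/apply:eval '
    '["[int(x * 0.1 * 2048) for x in range(5, 21)]"]'
)
scales = yaml.unsafe_load(snippet)["MIN_SIZE_TRAIN"]

print(scales[0], scales[-1], len(scales))  # 1024 4096 16
assert scales == [int(x * 0.1 * 2048) for x in range(5, 21)]
```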
/third_party/Mask2Former/configs/mapillary-vistas/semantic-segmentation/maskformer2_R50_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-MapillaryVistas-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 65 7 | NUM_CLASSES: 65 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.0 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/mapillary-vistas/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/Base-YouTubeVIS-VideoInstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | STEM_TYPE: "basic" # not used 12 | STEM_OUT_CHANNELS: 64 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | # NORM: "SyncBN" 16 | RES5_MULTI_GRID: [1, 1, 1] # not used 17 | DATASETS: 18 | TRAIN: ("ytvis_2019_train",) 19 | TEST: ("ytvis_2019_val",) 20 | SOLVER: 21 | IMS_PER_BATCH: 16 22 | BASE_LR: 0.0001 23 | STEPS: (4000,) 24 | MAX_ITER: 6000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 10 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | BACKBONE_MULTIPLIER: 0.1 30 | CLIP_GRADIENTS: 31 | ENABLED: True 32 | CLIP_TYPE: "full_model" 33 | CLIP_VALUE: 0.01 34 | NORM_TYPE: 2.0 35 | AMP: 36 | ENABLED: True 37 | INPUT: 38 | MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" 39 | RANDOM_FLIP: "flip_by_clip" 40 | 
AUGMENTATIONS: [] 41 | MIN_SIZE_TRAIN: (360, 480) 42 | MIN_SIZE_TEST: 360 43 | CROP: 44 | ENABLED: False 45 | TYPE: "absolute_range" 46 | SIZE: (600, 720) 47 | FORMAT: "RGB" 48 | TEST: 49 | EVAL_PERIOD: 0 50 | DATALOADER: 51 | FILTER_EMPTY_ANNOTATIONS: False 52 | NUM_WORKERS: 4 53 | VERSION: 2 54 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/swin/video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_83d103.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TEST: 480 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_e5f453.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | INPUT: 20 | MIN_SIZE_TEST: 480 21 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/swin/video_maskformer2_swin_small_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_1e7f22.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/swin/video_maskformer2_swin_tiny_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_86143f.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | WEIGHTS: 
"model_final_eba159.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_3c8ec9.pkl" 4 | META_ARCHITECTURE: "VideoMaskFormer" 5 | SEM_SEG_HEAD: 6 | NAME: "MaskFormerHead" 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 40 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: False 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: False 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2021/Base-YouTubeVIS-VideoInstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | STEM_TYPE: "basic" # not used 12 | STEM_OUT_CHANNELS: 64 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | # NORM: "SyncBN" 16 | RES5_MULTI_GRID: [1, 1, 1] # not used 17 | DATASETS: 18 | TRAIN: ("ytvis_2021_train",) 19 | TEST: ("ytvis_2021_val",) 20 | SOLVER: 21 | IMS_PER_BATCH: 16 22 | BASE_LR: 0.0001 23 | STEPS: (5500,) 24 | MAX_ITER: 8000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 10 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | BACKBONE_MULTIPLIER: 0.1 30 | CLIP_GRADIENTS: 31 | ENABLED: True 32 | CLIP_TYPE: "full_model" 33 | CLIP_VALUE: 0.01 34 | NORM_TYPE: 2.0 35 | AMP: 36 | ENABLED: True 37 | INPUT: 38 | MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" 39 | RANDOM_FLIP: "flip_by_clip" 40 | AUGMENTATIONS: [] 41 | MIN_SIZE_TRAIN: (360, 480) 42 | MIN_SIZE_TEST: 360 43 | CROP: 44 | ENABLED: False 45 | TYPE: "absolute_range" 46 | SIZE: (600, 720) 47 | FORMAT: "RGB" 48 | TEST: 49 | EVAL_PERIOD: 0 50 | DATALOADER: 51 | FILTER_EMPTY_ANNOTATIONS: False 52 | NUM_WORKERS: 4 53 | VERSION: 2 54 | -------------------------------------------------------------------------------- 
/third_party/Mask2Former/configs/youtubevis_2021/swin/video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_83d103.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TEST: 480 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2021/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_e5f453.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | # OOM when using a larger test size 20 | # INPUT: 21 | # MIN_SIZE_TEST: 480 22 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2021/swin/video_maskformer2_swin_small_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_1e7f22.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2021/swin/video_maskformer2_swin_tiny_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_86143f.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2021/video_maskformer2_R101_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_eba159.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- 
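Note: the `youtubevis_2021` swin-large config above keeps `MIN_SIZE_TEST` commented out because the larger test size runs out of GPU memory. A hedged sketch of how a memory-heavy step can be guarded with the `retry_if_cuda_oom` helper defined in `mask2former_video/utils/memory.py` later in this listing; the `upsample_masks` function and the tensor shape are made up for illustration and are not taken from the repository:

```
import torch
import torch.nn.functional as F

from mask2former_video.utils.memory import retry_if_cuda_oom


def upsample_masks(mask_logits, output_size):
    # Bilinear upsampling of per-query mask logits -- the kind of step that
    # can OOM at larger test resolutions.
    return F.interpolate(mask_logits, size=output_size, mode="bilinear", align_corners=False)


device = "cuda" if torch.cuda.is_available() else "cpu"
mask_logits = torch.randn(1, 100, 120, 216, device=device)  # hypothetical (batch, queries, H, W) shape
# Retries once after torch.cuda.empty_cache(); as a last resort it moves the inputs to CPU float32.
masks = retry_if_cuda_oom(upsample_masks)(mask_logits, (480, 864))
```

Because the CPU fallback converts inputs to float32 and may return CPU tensors, the caller is responsible for moving results back to the GPU if needed.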
/third_party/Mask2Former/configs/youtubevis_2021/video_maskformer2_R50_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_3c8ec9.pkl" 4 | META_ARCHITECTURE: "VideoMaskFormer" 5 | SEM_SEG_HEAD: 6 | NAME: "MaskFormerHead" 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 40 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: False 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: False 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /third_party/Mask2Former/datasets/ade20k_instance_catid_mapping.txt: -------------------------------------------------------------------------------- 1 | Instacne100 SceneParse150 FullADE20K 2 | 1 8 165 3 | 2 9 3055 4 | 3 11 350 5 | 4 13 1831 6 | 5 15 774 7 | 5 15 783 8 | 6 16 2684 9 | 7 19 687 10 | 8 20 471 11 | 9 21 401 12 | 10 23 1735 13 | 11 24 2473 14 | 12 25 2329 15 | 13 28 1564 16 | 14 31 57 17 | 15 32 2272 18 | 16 33 907 19 | 17 34 724 20 | 18 36 2985 21 | 18 36 533 22 | 19 37 1395 23 | 20 38 155 24 | 21 39 2053 25 | 22 40 689 26 | 23 42 266 27 | 24 43 581 28 | 25 44 2380 29 | 26 45 491 30 | 27 46 627 31 | 28 48 2388 32 | 29 50 943 33 | 30 51 2096 34 | 31 54 2530 35 | 32 56 420 36 | 33 57 1948 37 | 34 58 1869 38 | 35 59 2251 39 | 36 63 239 40 | 37 65 571 41 | 38 66 2793 42 | 39 67 978 43 | 40 68 236 44 | 41 70 181 45 | 42 71 629 46 | 43 72 2598 47 | 44 73 1744 48 | 45 74 1374 49 | 46 75 591 50 | 47 76 2679 51 | 48 77 223 52 | 49 79 47 53 | 50 81 327 54 | 51 82 2821 55 | 52 83 1451 56 | 53 84 2880 57 | 54 86 480 58 | 55 87 77 59 | 56 88 2616 60 | 57 89 246 61 | 57 89 247 62 | 58 90 2733 63 | 59 91 14 64 | 60 93 38 65 | 61 94 1936 66 | 62 96 120 67 | 63 98 1702 68 | 64 99 249 69 | 65 103 2928 70 | 66 104 2337 71 | 67 105 1023 72 | 68 108 2989 73 | 69 109 1930 74 | 70 111 2586 75 | 71 112 131 76 | 72 113 146 77 | 73 116 95 78 | 74 117 1563 79 | 75 119 1708 80 | 76 120 103 81 | 77 121 1002 82 | 78 122 2569 83 | 79 124 2833 84 | 80 125 1551 85 | 81 126 1981 86 | 82 127 29 87 | 83 128 187 88 | 84 130 747 89 | 85 131 2254 90 | 86 133 2262 91 | 87 134 1260 92 | 88 135 2243 93 | 89 136 2932 94 | 90 137 2836 95 | 91 138 2850 96 | 92 139 64 97 | 93 140 894 98 | 94 143 1919 99 | 95 144 1583 100 | 96 145 318 101 | 97 147 2046 102 | 98 148 1098 103 | 99 149 530 104 | 100 150 954 105 | -------------------------------------------------------------------------------- 
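Note: the `ade20k_instance_catid_mapping.txt` table above maps the 100 ADE20K instance categories to SceneParse150 and full-ADE20K ids; its header row contains the "Instacne100" typo, and a single instance id can map to several full-ADE20K ids (e.g. id 5 -> 774 and 783, id 18 -> 2985 and 533). A minimal parsing sketch, not part of the repository; the path and variable names are illustrative:

```
from collections import defaultdict
from pathlib import Path

mapping_path = Path("third_party/Mask2Former/datasets/ade20k_instance_catid_mapping.txt")

inst_to_scene = {}                 # Instance100 id -> SceneParse150 id
inst_to_full = defaultdict(list)   # Instance100 id -> list of FullADE20K ids

with open(mapping_path) as f:
    next(f)  # skip the "Instacne100 SceneParse150 FullADE20K" header row
    for line in f:
        if not line.strip():
            continue
        inst_id, scene_id, full_id = map(int, line.split())
        inst_to_scene[inst_id] = scene_id
        inst_to_full[inst_id].append(full_id)

print(len(inst_to_scene), "instance categories")  # expected: 100
```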
/third_party/Mask2Former/datasets/prepare_ade20k_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | import os 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import tqdm 9 | from PIL import Image 10 | 11 | 12 | def convert(input, output): 13 | img = np.asarray(Image.open(input)) 14 | assert img.dtype == np.uint8 15 | img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1 16 | Image.fromarray(img).save(output) 17 | 18 | 19 | if __name__ == "__main__": 20 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016" 21 | for name in ["training", "validation"]: 22 | annotation_dir = dataset_dir / "annotations" / name 23 | output_dir = dataset_dir / "annotations_detectron2" / name 24 | output_dir.mkdir(parents=True, exist_ok=True) 25 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 26 | output_file = output_dir / file.name 27 | convert(file, output_file) 28 | -------------------------------------------------------------------------------- /third_party/Mask2Former/datasets/prepare_coco_semantic_annos_from_panoptic_annos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | 5 | import functools 6 | import json 7 | import multiprocessing as mp 8 | import numpy as np 9 | import os 10 | import time 11 | from fvcore.common.download import download 12 | from panopticapi.utils import rgb2id 13 | from PIL import Image 14 | 15 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES 16 | 17 | 18 | def _process_panoptic_to_semantic(input_panoptic, output_semantic, segments, id_map): 19 | panoptic = np.asarray(Image.open(input_panoptic), dtype=np.uint32) 20 | panoptic = rgb2id(panoptic) 21 | output = np.zeros_like(panoptic, dtype=np.uint8) + 255 22 | for seg in segments: 23 | cat_id = seg["category_id"] 24 | new_cat_id = id_map[cat_id] 25 | output[panoptic == seg["id"]] = new_cat_id 26 | Image.fromarray(output).save(output_semantic) 27 | 28 | 29 | def separate_coco_semantic_from_panoptic(panoptic_json, panoptic_root, sem_seg_root, categories): 30 | """ 31 | Create semantic segmentation annotations from panoptic segmentation 32 | annotations, to be used by PanopticFPN. 33 | It maps all thing categories to class 0, and maps all unlabeled pixels to class 255. 34 | It maps all stuff categories to contiguous ids starting from 1. 35 | Args: 36 | panoptic_json (str): path to the panoptic json file, in COCO's format. 37 | panoptic_root (str): a directory with panoptic annotation files, in COCO's format. 38 | sem_seg_root (str): a directory to output semantic annotation files 39 | categories (list[dict]): category metadata. Each dict needs to have: 40 | "id": corresponds to the "category_id" in the json annotations 41 | "isthing": 0 or 1 42 | """ 43 | os.makedirs(sem_seg_root, exist_ok=True) 44 | 45 | id_map = {} # map from category id to id in the output semantic annotation 46 | assert len(categories) <= 254 47 | for i, k in enumerate(categories): 48 | id_map[k["id"]] = i 49 | # what is id = 0? 
50 | # id_map[0] = 255 51 | print(id_map) 52 | 53 | with open(panoptic_json) as f: 54 | obj = json.load(f) 55 | 56 | pool = mp.Pool(processes=max(mp.cpu_count() // 2, 4)) 57 | 58 | def iter_annotations(): 59 | for anno in obj["annotations"]: 60 | file_name = anno["file_name"] 61 | segments = anno["segments_info"] 62 | input = os.path.join(panoptic_root, file_name) 63 | output = os.path.join(sem_seg_root, file_name) 64 | yield input, output, segments 65 | 66 | print("Start writing to {} ...".format(sem_seg_root)) 67 | start = time.time() 68 | pool.starmap( 69 | functools.partial(_process_panoptic_to_semantic, id_map=id_map), 70 | iter_annotations(), 71 | chunksize=100, 72 | ) 73 | print("Finished. time: {:.2f}s".format(time.time() - start)) 74 | 75 | 76 | if __name__ == "__main__": 77 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco") 78 | for s in ["val2017", "train2017"]: 79 | separate_coco_semantic_from_panoptic( 80 | os.path.join(dataset_dir, "annotations/panoptic_{}.json".format(s)), 81 | os.path.join(dataset_dir, "panoptic_{}".format(s)), 82 | os.path.join(dataset_dir, "panoptic_semseg_{}".format(s)), 83 | COCO_CATEGORIES, 84 | ) 85 | -------------------------------------------------------------------------------- /third_party/Mask2Former/demo/README.md: -------------------------------------------------------------------------------- 1 | ## Mask2Former Demo 2 | 3 | We provide a command line tool to run a simple demo of builtin configs. 4 | The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md). 5 | -------------------------------------------------------------------------------- /third_party/Mask2Former/demo_video/README.md: -------------------------------------------------------------------------------- 1 | ## Video Mask2Former Demo 2 | 3 | We provide a command line tool to run a simple demo of builtin configs. 4 | The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md). 5 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import data # register all new datasets 3 | from . import modeling 4 | 5 | # config 6 | from .config import add_maskformer2_config 7 | 8 | # dataset loading 9 | from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper 10 | from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper 11 | from .data.dataset_mappers.mask_former_instance_dataset_mapper import ( 12 | MaskFormerInstanceDatasetMapper, 13 | ) 14 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import ( 15 | MaskFormerPanopticDatasetMapper, 16 | ) 17 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 18 | MaskFormerSemanticDatasetMapper, 19 | ) 20 | 21 | # models 22 | from .maskformer_model import MaskFormer 23 | from .test_time_augmentation import SemanticSegmentorWithTTA 24 | 25 | # evaluation 26 | from .evaluation.instance_evaluation import InstanceSegEvaluator 27 | 28 | __version__ = "0.1" -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . 
import datasets 3 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import ( 3 | register_ade20k_full, 4 | register_ade20k_panoptic, 5 | register_coco_stuff_10k, 6 | register_mapillary_vistas, 7 | register_coco_panoptic_annos_semseg, 8 | register_ade20k_instance, 9 | register_mapillary_vistas_panoptic, 10 | ) 11 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/third_party/Mask2Former/mask2former/evaluation/__init__.py -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .backbone.swin import D2SwinTransformer 3 | from .pixel_decoder.fpn import BasePixelDecoder 4 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder 5 | from .meta_arch.mask_former_head import MaskFormerHead 6 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead 7 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/__init__.py -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | try: 22 | import MultiScaleDeformableAttention as MSDA 23 | except ModuleNotFoundError as e: 24 | info_string = ( 25 | "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" 26 | "\t`cd mask2former/modeling/pixel_decoder/ops`\n" 27 | "\t`sh make.sh`\n" 28 | ) 29 | MSDA = None 30 | 31 | 32 | class MSDeformAttnFunction(Function): 33 | @staticmethod 34 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 35 | ctx.im2col_step = im2col_step 36 | output = MSDA.ms_deform_attn_forward( 37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 38 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 39 | return output 40 | 41 | @staticmethod 42 | @once_differentiable 43 | def backward(ctx, grad_output): 44 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 45 | grad_value, grad_sampling_loc, grad_attn_weight = \ 46 | MSDA.ms_deform_attn_backward( 47 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 48 | 49 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 50 | 51 | 52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 53 | # for debug and test only, 54 | # need to use cuda version instead 55 | N_, S_, M_, D_ = value.shape 56 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 57 | value_list = value.split([H_ * 
W_ for H_, W_ in value_spatial_shapes], dim=1) 58 | sampling_grids = 2 * sampling_locations - 1 59 | sampling_value_list = [] 60 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 61 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 62 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 63 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 64 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 65 | # N_*M_, D_, Lq_, P_ 66 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 67 | mode='bilinear', padding_mode='zeros', align_corners=False) 68 | sampling_value_list.append(sampling_value_l_) 69 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 70 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 71 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 72 | return output.transpose(1, 2).contiguous() 73 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | 22 | at::Tensor 23 | ms_deform_attn_cpu_forward( 24 | const at::Tensor &value, 25 | const at::Tensor &spatial_shapes, 26 | const at::Tensor &level_start_index, 27 | const at::Tensor &sampling_loc, 28 | const at::Tensor &attn_weight, 29 | const int im2col_step) 30 | { 31 | AT_ERROR("Not implement on cpu"); 32 | } 33 | 34 | std::vector 35 | ms_deform_attn_cpu_backward( 36 | const at::Tensor &value, 37 | const at::Tensor &spatial_shapes, 38 | const at::Tensor &level_start_index, 39 | const at::Tensor &sampling_loc, 40 | const at::Tensor &attn_weight, 41 | const at::Tensor &grad_output, 42 | const int im2col_step) 43 | { 44 | AT_ERROR("Not implement on cpu"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor 20 | ms_deform_attn_cpu_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step); 27 | 28 | std::vector 29 | ms_deform_attn_cpu_backward( 30 | const at::Tensor &value, 31 | const at::Tensor &spatial_shapes, 32 | const at::Tensor &level_start_index, 33 | const at::Tensor &sampling_loc, 34 | const at::Tensor &attn_weight, 35 | const at::Tensor &grad_output, 36 | const int im2col_step); 37 | 38 | 39 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor ms_deform_attn_cuda_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step); 26 | 27 | std::vector ms_deform_attn_cuda_backward( 28 | const at::Tensor &value, 29 | const at::Tensor &spatial_shapes, 30 | const at::Tensor &level_start_index, 31 | const at::Tensor &sampling_loc, 32 | const at::Tensor &attn_weight, 33 | const at::Tensor &grad_output, 34 | const int im2col_step); 35 | 36 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | 18 | #include "cpu/ms_deform_attn_cpu.h" 19 | 20 | #ifdef WITH_CUDA 21 | #include "cuda/ms_deform_attn_cuda.h" 22 | #endif 23 | 24 | 25 | at::Tensor 26 | ms_deform_attn_forward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const int im2col_step) 33 | { 34 | if (value.type().is_cuda()) 35 | { 36 | #ifdef WITH_CUDA 37 | return ms_deform_attn_cuda_forward( 38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector 47 | ms_deform_attn_backward( 48 | const at::Tensor &value, 49 | const at::Tensor &spatial_shapes, 50 | const at::Tensor &level_start_index, 51 | const at::Tensor &sampling_loc, 52 | const at::Tensor &attn_weight, 53 | const at::Tensor &grad_output, 54 | const int im2col_step) 55 | { 56 | if (value.type().is_cuda()) 57 | { 58 | #ifdef WITH_CUDA 59 | return ms_deform_attn_cuda_backward( 60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 61 | #else 62 | AT_ERROR("Not compiled with GPU support"); 63 | #endif 64 | } 65 | AT_ERROR("Not implemented on the CPU"); 66 | } 67 | 68 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include "ms_deform_attn.h" 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .maskformer_transformer_decoder import StandardTransformerDecoder 3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder 4 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 
16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | if mask is None: 31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 32 | not_mask = ~mask 33 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 34 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 35 | if self.normalize: 36 | eps = 1e-6 37 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 38 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 39 | 40 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 41 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 42 | 43 | pos_x = x_embed[:, :, :, None] / dim_t 44 | pos_y = y_embed[:, :, :, None] / dim_t 45 | pos_x = torch.stack( 46 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 47 | ).flatten(3) 48 | pos_y = torch.stack( 49 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 50 | ).flatten(3) 51 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 52 | return pos 53 | 54 | def __repr__(self, _repr_indent=4): 55 | head = "Positional encoding " + self.__class__.__name__ 56 | body = [ 57 | "num_pos_feats: {}".format(self.num_pos_feats), 58 | "temperature: {}".format(self.temperature), 59 | "normalize: {}".format(self.normalize), 60 | "scale: {}".format(self.scale), 61 | ] 62 | # _repr_indent = 4 63 | lines = [head] + [" " * _repr_indent + line for line in body] 64 | return "\n".join(lines) 65 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import modeling 3 | 4 | # config 5 | from .config import add_maskformer2_video_config 6 | 7 | # models 8 | from .video_maskformer_model import VideoMaskFormer 9 | 10 | # video 11 | from .data_video import ( 12 | YTVISDatasetMapper, 13 | YTVISEvaluator, 14 | build_detection_train_loader, 15 | build_detection_test_loader, 16 | get_detection_dataset_dicts, 17 | ) 18 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | from detectron2.config import CfgNode as CN 4 | 5 | 6 | def add_maskformer2_video_config(cfg): 7 | # video data 8 | # DataLoader 9 | cfg.INPUT.SAMPLING_FRAME_NUM = 2 10 | cfg.INPUT.SAMPLING_FRAME_RANGE = 20 11 | cfg.INPUT.SAMPLING_FRAME_SHUFFLE = False 12 | cfg.INPUT.AUGMENTATIONS = [] # "brightness", "contrast", "saturation", "rotation" 13 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/data_video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from .dataset_mapper import YTVISDatasetMapper, CocoClipDatasetMapper 5 | from .build import * 6 | 7 | from .datasets import * 8 | from .ytvis_eval import YTVISEvaluator 9 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/data_video/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from . import builtin # ensure the builtin datasets are registered 5 | 6 | __all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")] 7 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/data_video/datasets/builtin.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | import os 5 | 6 | from .ytvis import ( 7 | register_ytvis_instances, 8 | _get_ytvis_2019_instances_meta, 9 | _get_ytvis_2021_instances_meta, 10 | ) 11 | 12 | # ==== Predefined splits for YTVIS 2019 =========== 13 | _PREDEFINED_SPLITS_YTVIS_2019 = { 14 | "ytvis_2019_train": ("ytvis_2019/train/JPEGImages", 15 | "ytvis_2019/train.json"), 16 | "ytvis_2019_val": ("ytvis_2019/valid/JPEGImages", 17 | "ytvis_2019/valid.json"), 18 | "ytvis_2019_test": ("ytvis_2019/test/JPEGImages", 19 | "ytvis_2019/test.json"), 20 | } 21 | 22 | 23 | # ==== Predefined splits for YTVIS 2021 =========== 24 | _PREDEFINED_SPLITS_YTVIS_2021 = { 25 | "ytvis_2021_train": ("ytvis_2021/train/JPEGImages", 26 | "ytvis_2021/train.json"), 27 | "ytvis_2021_val": ("ytvis_2021/valid/JPEGImages", 28 | "ytvis_2021/valid.json"), 29 | "ytvis_2021_test": ("ytvis_2021/test/JPEGImages", 30 | "ytvis_2021/test.json"), 31 | } 32 | 33 | 34 | def register_all_ytvis_2019(root): 35 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2019.items(): 36 | # Assume pre-defined datasets live in `./datasets`. 37 | register_ytvis_instances( 38 | key, 39 | _get_ytvis_2019_instances_meta(), 40 | os.path.join(root, json_file) if "://" not in json_file else json_file, 41 | os.path.join(root, image_root), 42 | ) 43 | 44 | 45 | def register_all_ytvis_2021(root): 46 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2021.items(): 47 | # Assume pre-defined datasets live in `./datasets`. 
48 | register_ytvis_instances( 49 | key, 50 | _get_ytvis_2021_instances_meta(), 51 | os.path.join(root, json_file) if "://" not in json_file else json_file, 52 | os.path.join(root, image_root), 53 | ) 54 | 55 | 56 | if __name__.endswith(".builtin"): 57 | # Assume pre-defined datasets live in `./datasets`. 58 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 59 | register_all_ytvis_2019(_root) 60 | register_all_ytvis_2021(_root) 61 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/data_video/datasets/ytvis_api/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi 3 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .transformer_decoder.video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder 3 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder 3 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine3D(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 
16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | # b, t, c, h, w 31 | assert x.dim() == 5, f"{x.shape} should be a 5-dimensional Tensor, got {x.dim()}-dimensional Tensor instead" 32 | if mask is None: 33 | mask = torch.zeros((x.size(0), x.size(1), x.size(3), x.size(4)), device=x.device, dtype=torch.bool) 34 | not_mask = ~mask 35 | z_embed = not_mask.cumsum(1, dtype=torch.float32) 36 | y_embed = not_mask.cumsum(2, dtype=torch.float32) 37 | x_embed = not_mask.cumsum(3, dtype=torch.float32) 38 | if self.normalize: 39 | eps = 1e-6 40 | z_embed = z_embed / (z_embed[:, -1:, :, :] + eps) * self.scale 41 | y_embed = y_embed / (y_embed[:, :, -1:, :] + eps) * self.scale 42 | x_embed = x_embed / (x_embed[:, :, :, -1:] + eps) * self.scale 43 | 44 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 45 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 46 | 47 | dim_t_z = torch.arange((self.num_pos_feats * 2), dtype=torch.float32, device=x.device) 48 | dim_t_z = self.temperature ** (2 * (dim_t_z // 2) / (self.num_pos_feats * 2)) 49 | 50 | pos_x = x_embed[:, :, :, :, None] / dim_t 51 | pos_y = y_embed[:, :, :, :, None] / dim_t 52 | pos_z = z_embed[:, :, :, :, None] / dim_t_z 53 | pos_x = torch.stack((pos_x[:, :, :, :, 0::2].sin(), pos_x[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 54 | pos_y = torch.stack((pos_y[:, :, :, :, 0::2].sin(), pos_y[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 55 | pos_z = torch.stack((pos_z[:, :, :, :, 0::2].sin(), pos_z[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 56 | pos = (torch.cat((pos_y, pos_x), dim=4) + pos_z).permute(0, 1, 4, 2, 3) # b, t, c, h, w 57 | return pos 58 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/utils/memory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | import logging 4 | from contextlib import contextmanager 5 | from functools import wraps 6 | import torch 7 | from torch.cuda.amp import autocast 8 | 9 | __all__ = ["retry_if_cuda_oom"] 10 | 11 | 12 | @contextmanager 13 | def _ignore_torch_cuda_oom(): 14 | """ 15 | A context which ignores CUDA OOM exception from pytorch. 16 | """ 17 | try: 18 | yield 19 | except RuntimeError as e: 20 | # NOTE: the string may change? 21 | if "CUDA out of memory. " in str(e): 22 | pass 23 | else: 24 | raise 25 | 26 | 27 | def retry_if_cuda_oom(func): 28 | """ 29 | Makes a function retry itself after encountering 30 | pytorch's CUDA OOM error. 31 | It will first retry after calling `torch.cuda.empty_cache()`. 32 | If that still fails, it will then retry by trying to convert inputs to CPUs. 33 | In this case, it expects the function to dispatch to CPU implementation. 
34 | The return values may become CPU tensors as well and it's user's 35 | responsibility to convert it back to CUDA tensor if needed. 36 | Args: 37 | func: a stateless callable that takes tensor-like objects as arguments 38 | Returns: 39 | a callable which retries `func` if OOM is encountered. 40 | Examples: 41 | :: 42 | output = retry_if_cuda_oom(some_torch_function)(input1, input2) 43 | # output may be on CPU even if inputs are on GPU 44 | Note: 45 | 1. When converting inputs to CPU, it will only look at each argument and check 46 | if it has `.device` and `.to` for conversion. Nested structures of tensors 47 | are not supported. 48 | 2. Since the function might be called more than once, it has to be 49 | stateless. 50 | """ 51 | 52 | def maybe_to_cpu(x): 53 | try: 54 | like_gpu_tensor = x.device.type == "cuda" and hasattr(x, "to") 55 | except AttributeError: 56 | like_gpu_tensor = False 57 | if like_gpu_tensor: 58 | return x.to(device="cpu").to(torch.float32) 59 | else: 60 | return x 61 | 62 | @wraps(func) 63 | def wrapped(*args, **kwargs): 64 | with _ignore_torch_cuda_oom(): 65 | return func(*args, **kwargs) 66 | 67 | # Clear cache and retry 68 | torch.cuda.empty_cache() 69 | with _ignore_torch_cuda_oom(): 70 | return func(*args, **kwargs) 71 | 72 | # Try on CPU. This slows down the code significantly, therefore print a notice. 73 | logger = logging.getLogger(__name__) 74 | logger.info("Attempting to copy inputs to CPU due to CUDA OOM") 75 | new_args = (maybe_to_cpu(x) for x in args) 76 | new_kwargs = {k: maybe_to_cpu(v) for k, v in kwargs.items()} 77 | with autocast(enabled=False): 78 | return func(*new_args, **new_kwargs) 79 | 80 | return wrapped 81 | -------------------------------------------------------------------------------- /third_party/Mask2Former/predict.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, "Mask2Former") 3 | import tempfile 4 | from pathlib import Path 5 | import numpy as np 6 | import cv2 7 | import cog 8 | 9 | # import some common detectron2 utilities 10 | from detectron2.config import CfgNode as CN 11 | from detectron2.engine import DefaultPredictor 12 | from detectron2.config import get_cfg 13 | from detectron2.utils.visualizer import Visualizer, ColorMode 14 | from detectron2.data import MetadataCatalog 15 | from detectron2.projects.deeplab import add_deeplab_config 16 | 17 | # import Mask2Former project 18 | from mask2former import add_maskformer2_config 19 | 20 | 21 | class Predictor(cog.Predictor): 22 | def setup(self): 23 | cfg = get_cfg() 24 | add_deeplab_config(cfg) 25 | add_maskformer2_config(cfg) 26 | cfg.merge_from_file("Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml") 27 | cfg.MODEL.WEIGHTS = 'model_final_f07440.pkl' 28 | cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True 29 | cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = True 30 | cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = True 31 | self.predictor = DefaultPredictor(cfg) 32 | self.coco_metadata = MetadataCatalog.get("coco_2017_val_panoptic") 33 | 34 | 35 | @cog.input( 36 | "image", 37 | type=Path, 38 | help="Input image for segmentation. 
Output will be the concatenation of Panoptic segmentation (top), " 39 | "instance segmentation (middle), and semantic segmentation (bottom).", 40 | ) 41 | def predict(self, image): 42 | im = cv2.imread(str(image)) 43 | outputs = self.predictor(im) 44 | v = Visualizer(im[:, :, ::-1], self.coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW) 45 | panoptic_result = v.draw_panoptic_seg(outputs["panoptic_seg"][0].to("cpu"), 46 | outputs["panoptic_seg"][1]).get_image() 47 | v = Visualizer(im[:, :, ::-1], self.coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW) 48 | instance_result = v.draw_instance_predictions(outputs["instances"].to("cpu")).get_image() 49 | v = Visualizer(im[:, :, ::-1], self.coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW) 50 | semantic_result = v.draw_sem_seg(outputs["sem_seg"].argmax(0).to("cpu")).get_image() 51 | result = np.concatenate((panoptic_result, instance_result, semantic_result), axis=0)[:, :, ::-1] 52 | out_path = Path(tempfile.mkdtemp()) / "out.png" 53 | cv2.imwrite(str(out_path), result) 54 | return out_path 55 | -------------------------------------------------------------------------------- /third_party/Mask2Former/requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | scipy 3 | shapely 4 | timm 5 | h5py 6 | submitit 7 | scikit-image 8 | -------------------------------------------------------------------------------- /third_party/Mask2Former/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | import glob 5 | import os 6 | from os import path 7 | from setuptools import find_packages, setup 8 | import torch 9 | from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension 10 | 11 | torch_ver = [int(x) for x in torch.__version__.split(".")[:2]] 12 | assert torch_ver >= [1, 8], "Requires PyTorch >= 1.8" 13 | 14 | 15 | def get_version(): 16 | init_py_path = path.join(path.abspath(path.dirname(__file__)), "mask2former", "__init__.py") 17 | init_py = open(init_py_path, "r").readlines() 18 | version_line = [l.strip() for l in init_py if l.startswith("__version__")][0] 19 | version = version_line.split("=")[-1].strip().strip("'\"") 20 | 21 | return version 22 | 23 | 24 | # Copied from Detectron2 25 | def get_extensions(): 26 | # skip building 27 | if not (os.environ.get("FORCE_CUDA") or torch.cuda.is_available()) or CUDA_HOME is None: 28 | return [] 29 | 30 | this_dir = os.path.dirname(os.path.abspath(__file__)) 31 | extensions_dir = os.path.join(this_dir, "mask2former/modeling/pixel_decoder/ops/src") 32 | 33 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 34 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 35 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 36 | 37 | sources = main_file + source_cpu 38 | extension = CppExtension 39 | extra_compile_args = {"cxx": []} 40 | define_macros = [] 41 | 42 | # Force cuda since torch ask for a device, not if cuda is in fact available. 
43 | if (os.environ.get("FORCE_CUDA") or torch.cuda.is_available()) and CUDA_HOME is not None: 44 | extension = CUDAExtension 45 | sources += source_cuda 46 | define_macros += [("WITH_CUDA", None)] 47 | extra_compile_args["nvcc"] = [ 48 | "-DCUDA_HAS_FP16=1", 49 | "-D__CUDA_NO_HALF_OPERATORS__", 50 | "-D__CUDA_NO_HALF_CONVERSIONS__", 51 | "-D__CUDA_NO_HALF2_OPERATORS__", 52 | ] 53 | else: 54 | if CUDA_HOME is None: 55 | raise NotImplementedError( 56 | "CUDA_HOME is None. Please set environment variable CUDA_HOME." 57 | ) 58 | else: 59 | raise NotImplementedError( 60 | "No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available()." # noqa 61 | ) 62 | 63 | sources = [os.path.join(extensions_dir, s) for s in sources] 64 | include_dirs = [extensions_dir] 65 | ext_modules = [ 66 | extension( 67 | "MultiScaleDeformableAttention", 68 | sources, 69 | include_dirs=include_dirs, 70 | define_macros=define_macros, 71 | extra_compile_args=extra_compile_args, 72 | ) 73 | ] 74 | return ext_modules 75 | 76 | 77 | setup( 78 | name="mask2former", 79 | version=get_version(), 80 | author="Bowen Cheng", # Thanks Bowen! 81 | url="https://github.com/facebook/mask2former", 82 | description="A pip installable version of mask2former", 83 | packages=find_packages(exclude=("configs", "tests*")), 84 | python_requires=">=3.6", 85 | install_requires=[ 86 | "detectron2 @ https://github.com/facebookresearch/detectron2/archive/v0.6.zip", 87 | "scipy>=1.7.3", 88 | "boto3>=1.21.25", 89 | "hydra-core==1.1.1", 90 | # there is BC breaking in omegaconf 2.2.1 91 | # see: https://github.com/omry/omegaconf/issues/939 92 | "omegaconf==2.1.1", 93 | "panopticapi @ https://github.com/cocodataset/panopticapi/archive/master.zip", 94 | "lvis @ https://github.com/lvis-dataset/lvis-api/archive/master.zip", 95 | ], 96 | ext_modules=get_extensions(), 97 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 98 | ) 99 | -------------------------------------------------------------------------------- /third_party/Mask2Former/tools/README.md: -------------------------------------------------------------------------------- 1 | This directory contains few tools for MaskFormer. 2 | 3 | * `convert-torchvision-to-d2.py` 4 | 5 | Tool to convert torchvision pre-trained weights for D2. 6 | 7 | ``` 8 | wget https://download.pytorch.org/models/resnet101-63fe2227.pth 9 | python tools/convert-torchvision-to-d2.py resnet101-63fe2227.pth R-101.pkl 10 | ``` 11 | 12 | * `convert-pretrained-swin-model-to-d2.py` 13 | 14 | Tool to convert Swin Transformer pre-trained weights for D2. 
--------------------------------------------------------------------------------
/third_party/Mask2Former/tools/README.md:
--------------------------------------------------------------------------------
This directory contains a few tools for Mask2Former.

* `convert-torchvision-to-d2.py`

Tool to convert torchvision pre-trained weights for D2.

```
wget https://download.pytorch.org/models/resnet101-63fe2227.pth
python tools/convert-torchvision-to-d2.py resnet101-63fe2227.pth R-101.pkl
```

* `convert-pretrained-swin-model-to-d2.py`

Tool to convert Swin Transformer pre-trained weights for D2.

```
pip install timm

wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth
python tools/convert-pretrained-swin-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl

wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth
python tools/convert-pretrained-swin-model-to-d2.py swin_small_patch4_window7_224.pth swin_small_patch4_window7_224.pkl

wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth
python tools/convert-pretrained-swin-model-to-d2.py swin_base_patch4_window12_384_22k.pth swin_base_patch4_window12_384_22k.pkl

wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth
python tools/convert-pretrained-swin-model-to-d2.py swin_large_patch4_window12_384_22k.pth swin_large_patch4_window12_384_22k.pkl
```

A Python sketch that batch-downloads and converts all four Swin checkpoints is included after this tool list.

* `evaluate_pq_for_semantic_segmentation.py`

Tool to evaluate PQ (PQ-stuff) for semantic segmentation predictions.

Usage:

```
python tools/evaluate_pq_for_semantic_segmentation.py --dataset-name ade20k_sem_seg_val --json-file OUTPUT_DIR/inference/sem_seg_predictions.json
```

where `OUTPUT_DIR` is set in the config file.

* `evaluate_coco_boundary_ap.py`

Tool to evaluate Boundary AP for instance segmentation predictions.

Usage:

```
python tools/evaluate_coco_boundary_ap.py --gt-json-file COCO_GT_JSON --dt-json-file COCO_DT_JSON
```

To install the Boundary IoU API, run:

```
pip install git+https://github.com/bowenc0221/boundary-iou-api.git
```

* `analyze_model.py`

Tool to analyze model parameters and FLOPs.

Usage for semantic segmentation (ADE20K only, use with caution!):

```
python tools/analyze_model.py --num-inputs 1 --tasks flop --use-fixed-input-size --config-file CONFIG_FILE
```

Note that for semantic segmentation (ADE20K only), we use a dummy image with a fixed size equal to `cfg.INPUT.CROP.SIZE[0] x cfg.INPUT.CROP.SIZE[0]`.
Please do not use `--use-fixed-input-size` for calculating FLOPs on other datasets like Cityscapes!

Usage for panoptic and instance segmentation:

```
python tools/analyze_model.py --num-inputs 100 --tasks flop --config-file CONFIG_FILE
```

Note that for panoptic and instance segmentation, we compute the average FLOPs over 100 real validation images.
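As referenced under `convert-pretrained-swin-model-to-d2.py` above, here is a small sketch (not part of the repository) that batch-downloads and converts all four Swin checkpoints listed in the README. It assumes network access and that it is run from the Mask2Former root so that the `tools/` path resolves:

```python
# Hypothetical helper (not part of the repo): download and convert the four Swin
# checkpoints from the README above in one go.
import subprocess
import urllib.request

BASE_URL = "https://github.com/SwinTransformer/storage/releases/download/v1.0.0/"
CHECKPOINTS = [
    "swin_tiny_patch4_window7_224",
    "swin_small_patch4_window7_224",
    "swin_base_patch4_window12_384_22k",
    "swin_large_patch4_window12_384_22k",
]

for name in CHECKPOINTS:
    # same URLs and conversion command as in the README
    urllib.request.urlretrieve(BASE_URL + name + ".pth", name + ".pth")
    subprocess.run(
        ["python", "tools/convert-pretrained-swin-model-to-d2.py", name + ".pth", name + ".pkl"],
        check=True,
    )
```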
--------------------------------------------------------------------------------
/third_party/Mask2Former/tools/convert-pretrained-swin-model-to-d2.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import pickle as pkl
import sys

import torch

"""
Usage:
  # download pretrained swin model:
  wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth
  # run the conversion
  ./convert-pretrained-swin-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl
  # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config:
MODEL:
  WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl"
INPUT:
  FORMAT: "RGB"
"""

if __name__ == "__main__":
    input = sys.argv[1]

    obj = torch.load(input, map_location="cpu")["model"]

    res = {"model": obj, "__author__": "third_party", "matching_heuristics": True}

    with open(sys.argv[2], "wb") as f:
        pkl.dump(res, f)
--------------------------------------------------------------------------------
/third_party/Mask2Former/tools/convert-torchvision-to-d2.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates.

import pickle as pkl
import sys

import torch

"""
Usage:
  # download one of the ResNet{18,34,50,101,152} models from torchvision:
  wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth
  # run the conversion
  ./convert-torchvision-to-d2.py r50.pth r50.pkl
  # Then, use r50.pkl with the following changes in config:
MODEL:
  WEIGHTS: "/path/to/r50.pkl"
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]
  RESNETS:
    DEPTH: 50
    STRIDE_IN_1X1: False
INPUT:
  FORMAT: "RGB"
"""

if __name__ == "__main__":
    input = sys.argv[1]

    obj = torch.load(input, map_location="cpu")

    newmodel = {}
    for k in list(obj.keys()):
        old_k = k
        if "layer" not in k:
            k = "stem." + k
        for t in [1, 2, 3, 4]:
            k = k.replace("layer{}".format(t), "res{}".format(t + 1))
        for t in [1, 2, 3]:
            k = k.replace("bn{}".format(t), "conv{}.norm".format(t))
        k = k.replace("downsample.0", "shortcut")
        k = k.replace("downsample.1", "shortcut.norm")
        print(old_k, "->", k)
        newmodel[k] = obj.pop(old_k).detach().numpy()

    res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True}

    with open(sys.argv[2], "wb") as f:
        pkl.dump(res, f)
    if obj:
        print("Unconverted keys:", obj.keys())
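To make the renaming loop in `convert-torchvision-to-d2.py` concrete, here is a small illustration (not part of the repository) that applies the same replacement rules to a few typical torchvision ResNet keys and shows the detectron2-style names they map to:

```python
# Illustration (not part of the repo) of the key renaming done by
# convert-torchvision-to-d2.py above, using the same replacement rules.
def rename(k):
    if "layer" not in k:
        k = "stem." + k                      # stem conv/bn get a "stem." prefix
    for t in [1, 2, 3, 4]:
        k = k.replace("layer{}".format(t), "res{}".format(t + 1))  # layerN -> res(N+1)
    for t in [1, 2, 3]:
        k = k.replace("bn{}".format(t), "conv{}.norm".format(t))   # bnN -> convN.norm
    k = k.replace("downsample.0", "shortcut")        # downsample conv -> shortcut
    return k.replace("downsample.1", "shortcut.norm")  # downsample bn -> shortcut.norm


assert rename("conv1.weight") == "stem.conv1.weight"
assert rename("layer1.0.bn1.running_mean") == "res2.0.conv1.norm.running_mean"
assert rename("layer4.0.downsample.0.weight") == "res5.0.shortcut.weight"
```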
--------------------------------------------------------------------------------
/third_party/Mask2Former/tools/evaluate_coco_boundary_ap.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# Modified by Bowen Cheng from: https://github.com/bowenc0221/boundary-iou-api/blob/master/tools/coco_instance_evaluation.py

"""
Evaluation for COCO val2017:
python ./tools/evaluate_coco_boundary_ap.py \
  --gt-json-file COCO_GT_JSON \
  --dt-json-file COCO_DT_JSON
"""
import argparse
import json

from boundary_iou.coco_instance_api.coco import COCO
from boundary_iou.coco_instance_api.cocoeval import COCOeval


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--gt-json-file", default="")
    parser.add_argument("--dt-json-file", default="")
    parser.add_argument("--iou-type", default="boundary")
    parser.add_argument("--dilation-ratio", default="0.020", type=float)
    args = parser.parse_args()
    print(args)

    annFile = args.gt_json_file
    resFile = args.dt_json_file
    dilation_ratio = args.dilation_ratio
    if args.iou_type == "boundary":
        get_boundary = True
    else:
        get_boundary = False
    cocoGt = COCO(annFile, get_boundary=get_boundary, dilation_ratio=dilation_ratio)

    # remove box predictions
    resFile = json.load(open(resFile))
    for c in resFile:
        c.pop("bbox", None)

    cocoDt = cocoGt.loadRes(resFile)
    cocoEval = COCOeval(cocoGt, cocoDt, iouType=args.iou_type, dilation_ratio=dilation_ratio)
    cocoEval.evaluate()
    cocoEval.accumulate()
    cocoEval.summarize()


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
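For reference, the file passed as `--dt-json-file` is expected to follow the standard COCO results format: a JSON list of per-instance records with RLE-encoded masks. The sketch below (illustrative placeholders only, not part of the repository) shows the minimal fields `cocoGt.loadRes()` consumes; any `bbox` field is dropped by the script above before loading.

```python
# Illustrative only: minimal COCO-style instance segmentation results, the format
# expected for --dt-json-file. All values below are placeholders.
import json

detections = [
    {
        "image_id": 139,                 # image id from the ground-truth JSON
        "category_id": 1,                # COCO category id
        "segmentation": {                # RLE-encoded mask
            "size": [480, 640],          # [height, width] of the image
            "counts": "<RLE counts string>",
        },
        "score": 0.97,
        # "bbox" may also be present; the script removes it before loadRes()
    }
]

with open("COCO_DT_JSON", "w") as f:     # placeholder path, as in the usage string
    json.dump(detections, f)
```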