├── .gitignore ├── GETTING_STARTED.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── configs ├── Panoptic │ ├── odise_caption_coco_50e.py │ └── odise_label_coco_50e.py └── common │ ├── data │ ├── coco_panoptic_semseg.py │ └── pano_open_d2_eval.py │ ├── models │ ├── mask_generator_with_caption.py │ ├── mask_generator_with_label.py │ ├── odise_with_caption.py │ └── odise_with_label.py │ ├── optim.py │ ├── schedule.py │ └── train.py ├── datasets ├── README.md ├── ade20k_instance_catid_mapping.txt ├── ade20k_instance_imgCatIds.json ├── prepare_ade20k_full_sem_seg.py ├── prepare_ade20k_ins_seg.py ├── prepare_ade20k_pan_seg.py ├── prepare_ade20k_sem_seg.py ├── prepare_coco_caption.py ├── prepare_coco_semantic_annos_from_panoptic_annos.py ├── prepare_lvis_openseg_labels.py ├── prepare_pascal_ctx_full_sem_seg.py ├── prepare_pascal_ctx_sem_seg.py └── prepare_pascal_voc_sem_seg.py ├── demo ├── app.py ├── demo.ipynb ├── demo.py └── examples │ ├── ade.jpg │ ├── coco.jpg │ ├── ego4d.jpg │ └── purse.jpeg ├── docker └── Dockerfile ├── figs ├── github_arch.gif ├── github_vis_ade_0.gif ├── github_vis_ade_1.gif ├── github_vis_coco_0.gif ├── github_vis_coco_1.gif ├── github_vis_ego4d_0.gif ├── github_vis_ego4d_1.gif └── teaser.jpg ├── odise ├── __init__.py ├── checkpoint │ ├── __init__.py │ └── odise_checkpointer.py ├── config │ ├── __init__.py │ ├── instantiate.py │ └── utils.py ├── data │ ├── __init__.py │ ├── build.py │ ├── dataset_mapper.py │ └── datasets │ │ ├── __init__.py │ │ ├── openseg_labels │ │ ├── README.md │ │ ├── ade20k_150.txt │ │ ├── ade20k_150_with_prompt_eng.txt │ │ ├── ade20k_847.txt │ │ ├── ade20k_847_with_prompt_eng.txt │ │ ├── coco_panoptic.txt │ │ ├── coco_panoptic_with_prompt_eng.txt │ │ ├── lvis_1203.txt │ │ ├── lvis_1203_with_prompt_eng.txt │ │ ├── pascal_context_459.txt │ │ ├── pascal_context_459_with_prompt_eng.txt │ │ ├── pascal_context_59.txt │ │ ├── pascal_context_59_with_prompt_eng.txt │ │ ├── pascal_voc_21.txt │ │ └── pascal_voc_21_with_prompt_eng.txt │ │ ├── register_coco_caption.py │ │ └── register_pascal.py ├── engine │ ├── __init__.py │ ├── defaults.py │ ├── hooks.py │ └── train_loop.py ├── evaluation │ ├── __init__.py │ ├── d2_evaluator.py │ └── evaluator.py ├── model_zoo │ ├── __init__.py │ └── model_zoo.py ├── modeling │ ├── __init__.py │ ├── backbone │ │ ├── __init__.py │ │ └── feature_extractor.py │ ├── diffusion │ │ ├── __init__.py │ │ ├── diffusion_builder.py │ │ ├── gaussian_diffusion.py │ │ ├── resample.py │ │ └── respace.py │ ├── meta_arch │ │ ├── __init__.py │ │ ├── clip.py │ │ ├── helper.py │ │ ├── ldm.py │ │ └── odise.py │ ├── preprocess.py │ └── wrapper │ │ ├── __init__.py │ │ └── pano_wrapper.py └── utils │ ├── __init__.py │ ├── collect_env.py │ ├── events.py │ ├── file_io.py │ └── parameter_count.py ├── setup.cfg ├── setup.py ├── third_party └── Mask2Former │ ├── .gitignore │ ├── ADVANCED_USAGE.md │ ├── CODE_OF_CONDUCT.md │ ├── CONTRIBUTING.md │ ├── GETTING_STARTED.md │ ├── INSTALL.md │ ├── LICENSE │ ├── MODEL_ZOO.md │ ├── README.md │ ├── cog.yaml │ ├── configs │ ├── ade20k │ │ ├── instance-segmentation │ │ │ ├── Base-ADE20K-InstanceSegmentation.yaml │ │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ │ └── swin │ │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_160k.yaml │ │ ├── panoptic-segmentation │ │ │ ├── Base-ADE20K-PanopticSegmentation.yaml │ │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ │ └── swin │ │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_160k.yaml │ │ └── semantic-segmentation │ │ │ ├── Base-ADE20K-SemanticSegmentation.yaml │ │ │ ├── 
maskformer2_R101_bs16_90k.yaml │ │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ │ └── swin │ │ │ ├── maskformer2_swin_base_384_bs16_160k_res640.yaml │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_160k_res640.yaml │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_160k_res640.yaml │ │ │ ├── maskformer2_swin_small_bs16_160k.yaml │ │ │ └── maskformer2_swin_tiny_bs16_160k.yaml │ ├── cityscapes │ │ ├── instance-segmentation │ │ │ ├── Base-Cityscapes-InstanceSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ │ └── swin │ │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ │ └── maskformer2_swin_tiny_bs16_90k.yaml │ │ ├── panoptic-segmentation │ │ │ ├── Base-Cityscapes-PanopticSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ │ └── swin │ │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ │ └── maskformer2_swin_tiny_bs16_90k.yaml │ │ └── semantic-segmentation │ │ │ ├── Base-Cityscapes-SemanticSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ │ └── swin │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ └── maskformer2_swin_tiny_bs16_90k.yaml │ ├── coco │ │ ├── instance-segmentation │ │ │ ├── Base-COCO-InstanceSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_50ep.yaml │ │ │ ├── maskformer2_R50_bs16_50ep.yaml │ │ │ └── swin │ │ │ │ ├── maskformer2_swin_base_384_bs16_50ep.yaml │ │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_50ep.yaml │ │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_100ep.yaml │ │ │ │ ├── maskformer2_swin_small_bs16_50ep.yaml │ │ │ │ └── maskformer2_swin_tiny_bs16_50ep.yaml │ │ └── panoptic-segmentation │ │ │ ├── Base-COCO-PanopticSegmentation.yaml │ │ │ ├── maskformer2_R101_bs16_50ep.yaml │ │ │ ├── maskformer2_R50_bs16_50ep.yaml │ │ │ └── swin │ │ │ ├── maskformer2_swin_base_384_bs16_50ep.yaml │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_50ep.yaml │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_100ep.yaml │ │ │ ├── maskformer2_swin_small_bs16_50ep.yaml │ │ │ └── maskformer2_swin_tiny_bs16_50ep.yaml │ ├── mapillary-vistas │ │ ├── panoptic-segmentation │ │ │ ├── Base-MapillaryVistas-PanopticSegmentation.yaml │ │ │ ├── maskformer_R50_bs16_300k.yaml │ │ │ └── swin │ │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_300k.yaml │ │ └── semantic-segmentation │ │ │ ├── Base-MapillaryVistas-SemanticSegmentation.yaml │ │ │ ├── maskformer2_R50_bs16_300k.yaml │ │ │ └── swin │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_300k.yaml │ ├── youtubevis_2019 │ │ ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml │ │ ├── swin │ │ │ ├── video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml │ │ │ ├── video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml │ │ │ ├── video_maskformer2_swin_small_bs16_8ep.yaml │ │ │ └── video_maskformer2_swin_tiny_bs16_8ep.yaml │ │ ├── video_maskformer2_R101_bs16_8ep.yaml │ │ └── video_maskformer2_R50_bs16_8ep.yaml │ └── youtubevis_2021 │ │ ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml │ │ ├── swin │ │ ├── video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml │ │ ├── video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml │ │ ├── video_maskformer2_swin_small_bs16_8ep.yaml │ │ └── 
video_maskformer2_swin_tiny_bs16_8ep.yaml │ │ ├── video_maskformer2_R101_bs16_8ep.yaml │ │ └── video_maskformer2_R50_bs16_8ep.yaml │ ├── datasets │ ├── README.md │ ├── ade20k_instance_catid_mapping.txt │ ├── ade20k_instance_imgCatIds.json │ ├── prepare_ade20k_ins_seg.py │ ├── prepare_ade20k_pan_seg.py │ ├── prepare_ade20k_sem_seg.py │ └── prepare_coco_semantic_annos_from_panoptic_annos.py │ ├── demo │ ├── README.md │ ├── demo.py │ └── predictor.py │ ├── demo_video │ ├── README.md │ ├── demo.py │ ├── predictor.py │ └── visualizer.py │ ├── mask2former │ ├── __init__.py │ ├── config.py │ ├── data │ │ ├── __init__.py │ │ ├── dataset_mappers │ │ │ ├── __init__.py │ │ │ ├── coco_instance_new_baseline_dataset_mapper.py │ │ │ ├── coco_panoptic_new_baseline_dataset_mapper.py │ │ │ ├── mask_former_instance_dataset_mapper.py │ │ │ ├── mask_former_panoptic_dataset_mapper.py │ │ │ └── mask_former_semantic_dataset_mapper.py │ │ └── datasets │ │ │ ├── __init__.py │ │ │ ├── register_ade20k_full.py │ │ │ ├── register_ade20k_instance.py │ │ │ ├── register_ade20k_panoptic.py │ │ │ ├── register_coco_panoptic_annos_semseg.py │ │ │ ├── register_coco_stuff_10k.py │ │ │ ├── register_mapillary_vistas.py │ │ │ └── register_mapillary_vistas_panoptic.py │ ├── evaluation │ │ ├── __init__.py │ │ └── instance_evaluation.py │ ├── maskformer_model.py │ ├── modeling │ │ ├── __init__.py │ │ ├── backbone │ │ │ ├── __init__.py │ │ │ └── swin.py │ │ ├── criterion.py │ │ ├── matcher.py │ │ ├── meta_arch │ │ │ ├── __init__.py │ │ │ ├── mask_former_head.py │ │ │ └── per_pixel_baseline.py │ │ ├── pixel_decoder │ │ │ ├── __init__.py │ │ │ ├── fpn.py │ │ │ ├── msdeformattn.py │ │ │ └── ops │ │ │ │ ├── __init__.py │ │ │ │ ├── functions │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn_func.py │ │ │ │ ├── modules │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn.py │ │ │ │ ├── src │ │ │ │ ├── cpu │ │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ │ └── ms_deform_attn_cpu.h │ │ │ │ ├── cuda │ │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ │ ├── ms_deform_attn.h │ │ │ │ └── vision.cpp │ │ │ │ └── test.py │ │ └── transformer_decoder │ │ │ ├── __init__.py │ │ │ ├── mask2former_transformer_decoder.py │ │ │ ├── maskformer_transformer_decoder.py │ │ │ ├── position_encoding.py │ │ │ └── transformer.py │ ├── test_time_augmentation.py │ └── utils │ │ ├── __init__.py │ │ └── misc.py │ ├── mask2former_video │ ├── __init__.py │ ├── config.py │ ├── data_video │ │ ├── __init__.py │ │ ├── augmentation.py │ │ ├── build.py │ │ ├── dataset_mapper.py │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ ├── builtin.py │ │ │ ├── ytvis.py │ │ │ └── ytvis_api │ │ │ │ ├── __init__.py │ │ │ │ ├── ytvos.py │ │ │ │ └── ytvoseval.py │ │ └── ytvis_eval.py │ ├── modeling │ │ ├── __init__.py │ │ ├── criterion.py │ │ ├── matcher.py │ │ └── transformer_decoder │ │ │ ├── __init__.py │ │ │ ├── position_encoding.py │ │ │ └── video_mask2former_transformer_decoder.py │ ├── utils │ │ ├── __init__.py │ │ └── memory.py │ └── video_maskformer_model.py │ ├── predict.py │ ├── requirements.txt │ ├── setup.py │ ├── tools │ ├── README.md │ ├── analyze_model.py │ ├── convert-pretrained-swin-model-to-d2.py │ ├── convert-torchvision-to-d2.py │ ├── evaluate_coco_boundary_ap.py │ └── evaluate_pq_for_semantic_segmentation.py │ ├── train_net.py │ └── train_net_video.py └── tools └── train_net.py /.gitignore: -------------------------------------------------------------------------------- 1 | # output dir 2 | output 3 | 
instant_test_output 4 | inference_test_output 5 | 6 | 7 | *.png 8 | *.json 9 | *.diff 10 | *.jpg 11 | !/projects/DensePose/doc/images/*.jpg 12 | 13 | # compilation and distribution 14 | __pycache__ 15 | _ext 16 | *.pyc 17 | *.pyd 18 | *.so 19 | *.dll 20 | *.egg-info/ 21 | build/ 22 | dist/ 23 | wheels/ 24 | 25 | # pytorch/python/numpy formats 26 | *.pth 27 | *.pkl 28 | *.npy 29 | *.ts 30 | model_ts*.txt 31 | 32 | # ipython/jupyter notebooks 33 | *.ipynb 34 | **/.ipynb_checkpoints/ 35 | 36 | # Editor temporaries 37 | *.swn 38 | *.swo 39 | *.swp 40 | *~ 41 | 42 | # editor settings 43 | .idea 44 | .vscode 45 | _darcs 46 | 47 | # project dirs 48 | /detectron2/model_zoo/configs 49 | /odise/model_zoo/configs 50 | /datasets/* 51 | !/datasets/*.* 52 | /projects/*/datasets 53 | /models 54 | /snippet 55 | 56 | # Mac 57 | *.DS_Store 58 | 59 | # Gradio 60 | gradio_queue.db 61 | 62 | # CLIP 63 | *.pt 64 | 65 | # stable diffusion 66 | *.ckpt 67 | 68 | *.o -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include odise/data/datasets/openseg_labels/*.txt 2 | -------------------------------------------------------------------------------- /configs/Panoptic/odise_caption_coco_50e.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from detectron2.config import LazyCall as L 12 | from detectron2.solver import WarmupParamScheduler 13 | from fvcore.common.param_scheduler import MultiStepParamScheduler 14 | 15 | from ..common.models.odise_with_caption import model 16 | from ..common.data.coco_panoptic_semseg import dataloader 17 | from ..common.train import train 18 | from ..common.optim import AdamW as optimizer 19 | from ..common.data.pano_open_d2_eval import ( 20 | ade150_open_eval as _ade150_eval, 21 | ctx59_open_eval as _ctx59_eval, 22 | ade847_open_eval as _ade847_eval, 23 | ctx459_open_eval as _ctx459_eval, 24 | pas21_open_eval as _pas21_eval, 25 | ) 26 | 27 | train.max_iter = 92_188 28 | train.grad_clip = 0.01 29 | train.checkpointer.period = 4500 30 | 31 | lr_multiplier = L(WarmupParamScheduler)( 32 | scheduler=L(MultiStepParamScheduler)( 33 | values=[1.0, 0.1, 0.01], 34 | # assume 100e with batch-size 64 as original LSJ 35 | # Equivalent to 100 epochs. 
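# (Note: the scheduler below is defined on this 100-epoch / 184,375-iteration scale, while the
#  run itself stops at train.max_iter = 92,188 iterations, i.e. 92,188 * 64 / ~118k images
#  ≈ 50 epochs, which is what the "_50e" in the config name refers to.)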
36 | # 100 ep = 184375 iters * 64 images/iter / 118000 images/ep 37 | milestones=[163889, 177546], 38 | num_updates=184375, 39 | ), 40 | # for warmup length we adopted COCO LSJ setting 41 | warmup_length=500 / 184375, 42 | warmup_factor=0.067, 43 | ) 44 | 45 | optimizer.lr = 1e-4 46 | optimizer.weight_decay = 0.05 47 | 48 | dataloader.train.dataset.names = "coco_2017_train_panoptic_caption_with_sem_seg" 49 | 50 | _ade847_eval.final_iter_only = True 51 | _ctx459_eval.final_iter_only = True 52 | 53 | dataloader.extra_task = dict( 54 | eval_ade150=_ade150_eval, 55 | eval_ctx59=_ctx59_eval, 56 | eval_ade847=_ade847_eval, 57 | eval_ctx459=_ctx459_eval, 58 | eval_pas21=_pas21_eval, 59 | ) 60 | -------------------------------------------------------------------------------- /configs/Panoptic/odise_label_coco_50e.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from detectron2.config import LazyCall as L 12 | from detectron2.solver import WarmupParamScheduler 13 | from fvcore.common.param_scheduler import MultiStepParamScheduler 14 | 15 | from ..common.models.odise_with_label import model 16 | from ..common.data.coco_panoptic_semseg import dataloader 17 | from ..common.train import train 18 | from ..common.optim import AdamW as optimizer 19 | from ..common.data.pano_open_d2_eval import ( 20 | ade150_open_eval as _ade150_eval, 21 | ctx59_open_eval as _ctx59_eval, 22 | ade847_open_eval as _ade847_eval, 23 | ctx459_open_eval as _ctx459_eval, 24 | pas21_open_eval as _pas21_eval, 25 | ) 26 | 27 | train.max_iter = 92_188 28 | train.grad_clip = 0.01 29 | train.checkpointer.period = 4500 30 | 31 | lr_multiplier = L(WarmupParamScheduler)( 32 | scheduler=L(MultiStepParamScheduler)( 33 | values=[1.0, 0.1, 0.01], 34 | # assume 100e with batch-size 64 as original LSJ 35 | # Equivalent to 100 epochs. 36 | # 100 ep = 184375 iters * 64 images/iter / 118000 images/ep 37 | milestones=[163889, 177546], 38 | num_updates=184375, 39 | ), 40 | # for warmup length we adopted COCO LSJ setting 41 | warmup_length=500 / 184375, 42 | warmup_factor=0.067, 43 | ) 44 | 45 | optimizer.lr = 1e-4 46 | optimizer.weight_decay = 0.05 47 | 48 | _ade847_eval.final_iter_only = True 49 | _ctx459_eval.final_iter_only = True 50 | 51 | dataloader.extra_task = dict( 52 | eval_ade150=_ade150_eval, 53 | eval_ctx59=_ctx59_eval, 54 | eval_ade847=_ade847_eval, 55 | eval_ctx459=_ctx459_eval, 56 | eval_pas21=_pas21_eval, 57 | ) 58 | -------------------------------------------------------------------------------- /configs/common/data/coco_panoptic_semseg.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | # To view a copy of this license, visit 4 | # https://github.com/facebookresearch/detectron2/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | from omegaconf import OmegaConf 18 | 19 | import detectron2.data.transforms as T 20 | from detectron2.config import LazyCall as L 21 | from detectron2.data import get_detection_dataset_dicts 22 | from detectron2.data import DatasetMapper 23 | 24 | from odise.data import ( 25 | COCOPanopticDatasetMapper, 26 | build_d2_test_dataloader, 27 | build_d2_train_dataloader, 28 | get_openseg_labels, 29 | ) 30 | from odise.evaluation.d2_evaluator import ( 31 | COCOEvaluator, 32 | COCOPanopticEvaluator, 33 | SemSegEvaluator, 34 | ) 35 | from odise.modeling.wrapper.pano_wrapper import OpenPanopticInference 36 | from detectron2.data import MetadataCatalog 37 | 38 | dataloader = OmegaConf.create() 39 | 40 | dataloader.train = L(build_d2_train_dataloader)( 41 | dataset=L(get_detection_dataset_dicts)( 42 | names="coco_2017_train_panoptic_with_sem_seg", filter_empty=True 43 | ), 44 | mapper=L(COCOPanopticDatasetMapper)( 45 | is_train=True, 46 | # COCO LSJ aug 47 | augmentations=[ 48 | L(T.RandomFlip)(horizontal=True), 49 | L(T.ResizeScale)( 50 | min_scale=0.1, 51 | max_scale=2.0, 52 | target_height=1024, 53 | target_width=1024, 54 | ), 55 | L(T.FixedSizeCrop)(crop_size=(1024, 1024)), 56 | ], 57 | image_format="RGB", 58 | ), 59 | total_batch_size=64, 60 | num_workers=4, 61 | ) 62 | 63 | dataloader.test = L(build_d2_test_dataloader)( 64 | dataset=L(get_detection_dataset_dicts)( 65 | names="coco_2017_val_panoptic_with_sem_seg", 66 | filter_empty=False, 67 | ), 68 | mapper=L(DatasetMapper)( 69 | is_train=False, 70 | augmentations=[ 71 | L(T.ResizeShortestEdge)(short_edge_length=1024, sample_style="choice", max_size=2560), 72 | ], 73 | image_format="${...train.mapper.image_format}", 74 | ), 75 | local_batch_size=1, 76 | num_workers=1, 77 | ) 78 | 79 | dataloader.evaluator = [ 80 | L(COCOEvaluator)( 81 | dataset_name="${...test.dataset.names}", 82 | tasks=("segm",), 83 | ), 84 | L(SemSegEvaluator)( 85 | dataset_name="${...test.dataset.names}", 86 | ), 87 | L(COCOPanopticEvaluator)( 88 | dataset_name="${...test.dataset.names}", 89 | ), 90 | ] 91 | 92 | dataloader.wrapper = L(OpenPanopticInference)( 93 | labels=L(get_openseg_labels)(dataset="coco_panoptic", prompt_engineered=True), 94 | metadata=L(MetadataCatalog.get)(name="${...test.dataset.names}"), 95 | ) 96 | -------------------------------------------------------------------------------- /configs/common/models/odise_with_caption.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from detectron2.config import LazyCall as L 12 | from odise.modeling.meta_arch.ldm import LdmImplicitCaptionerExtractor 13 | from odise.modeling.backbone.feature_extractor import FeatureExtractorBackbone 14 | from .mask_generator_with_caption import model 15 | 16 | model.backbone = L(FeatureExtractorBackbone)( 17 | feature_extractor=L(LdmImplicitCaptionerExtractor)( 18 | encoder_block_indices=(5, 7), 19 | unet_block_indices=(2, 5, 8, 11), 20 | decoder_block_indices=(2, 5), 21 | steps=(0,), 22 | learnable_time_embed=True, 23 | num_timesteps=1, 24 | clip_model_name="ViT-L-14-336", 25 | ), 26 | out_features=["s2", "s3", "s4", "s5"], 27 | use_checkpoint=True, 28 | slide_training=True, 29 | ) 30 | model.sem_seg_head.pixel_decoder.transformer_in_features = ["s3", "s4", "s5"] 31 | model.clip_head.alpha = 0.35 32 | model.clip_head.beta = 0.65 33 | -------------------------------------------------------------------------------- /configs/common/models/odise_with_label.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from detectron2.config import LazyCall as L 12 | from odise.modeling.meta_arch.ldm import LdmImplicitCaptionerExtractor 13 | from odise.modeling.backbone.feature_extractor import FeatureExtractorBackbone 14 | from .mask_generator_with_label import model 15 | 16 | model.backbone = L(FeatureExtractorBackbone)( 17 | feature_extractor=L(LdmImplicitCaptionerExtractor)( 18 | encoder_block_indices=(5, 7), 19 | unet_block_indices=(2, 5, 8, 11), 20 | decoder_block_indices=(2, 5), 21 | steps=(0,), 22 | learnable_time_embed=True, 23 | num_timesteps=1, 24 | clip_model_name="ViT-L-14-336", 25 | ), 26 | out_features=["s2", "s3", "s4", "s5"], 27 | use_checkpoint=True, 28 | slide_training=True, 29 | ) 30 | model.sem_seg_head.pixel_decoder.transformer_in_features = ["s3", "s4", "s5"] 31 | model.clip_head.alpha = 0.3 32 | model.clip_head.beta = 0.7 33 | -------------------------------------------------------------------------------- /configs/common/optim.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # To view a copy of this license, visit 4 | # https://github.com/facebookresearch/detectron2/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 
11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | import torch 18 | 19 | from detectron2.config import LazyCall as L 20 | from detectron2.solver.build import get_default_optimizer_params 21 | 22 | 23 | AdamW = L(torch.optim.AdamW)( 24 | params=L(get_default_optimizer_params)( 25 | # params.model is meant to be set to the model object, before instantiating 26 | # the optimizer. 27 | weight_decay_norm=0.0, 28 | weight_decay_bias=0.0, 29 | ), 30 | lr="???", 31 | weight_decay="???", 32 | ) 33 | -------------------------------------------------------------------------------- /configs/common/schedule.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # To view a copy of this license, visit 4 | # https://github.com/facebookresearch/detectron2/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | from fvcore.common.param_scheduler import CosineParamScheduler 18 | 19 | from detectron2.config import LazyCall as L 20 | from detectron2.solver import WarmupParamScheduler 21 | 22 | cosine_lr_multiplier = L(WarmupParamScheduler)( 23 | scheduler=L(CosineParamScheduler)(start_value=1.0, end_value=0.01), 24 | warmup_length="???", 25 | warmup_method="linear", 26 | warmup_factor=0.001, 27 | ) 28 | -------------------------------------------------------------------------------- /configs/common/train.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # To view a copy of this license, visit 4 | # https://github.com/facebookresearch/detectron2/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 
11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | # Common training-related configs that are designed for "tools/lazyconfig_train_net.py" 18 | # You can use your own instead, together with your own train_net.py 19 | 20 | train = dict( 21 | output_dir="./output", 22 | init_checkpoint="", 23 | max_iter="???", 24 | amp=dict( 25 | enabled=False, 26 | opt_level=None, 27 | ), # options for Automatic Mixed Precision 28 | grad_clip=None, 29 | ddp=dict( # options for DistributedDataParallel 30 | broadcast_buffers=False, 31 | find_unused_parameters=False, 32 | fp16_compression=False, 33 | ), 34 | checkpointer=dict(period=5000, max_to_keep=2), # options for PeriodicCheckpointer 35 | eval_period="${train.checkpointer.period}", 36 | log_period=50, 37 | device="cuda", 38 | seed=42, 39 | # ... 40 | wandb=dict( 41 | enable_writer=False, 42 | resume=False, 43 | project="ODISE", 44 | ), 45 | cfg_name="", 46 | run_name="", 47 | run_tag="", 48 | reference_world_size=0, 49 | ) 50 | -------------------------------------------------------------------------------- /datasets/ade20k_instance_catid_mapping.txt: -------------------------------------------------------------------------------- 1 | Instacne100 SceneParse150 FullADE20K 2 | 1 8 165 3 | 2 9 3055 4 | 3 11 350 5 | 4 13 1831 6 | 5 15 774 7 | 5 15 783 8 | 6 16 2684 9 | 7 19 687 10 | 8 20 471 11 | 9 21 401 12 | 10 23 1735 13 | 11 24 2473 14 | 12 25 2329 15 | 13 28 1564 16 | 14 31 57 17 | 15 32 2272 18 | 16 33 907 19 | 17 34 724 20 | 18 36 2985 21 | 18 36 533 22 | 19 37 1395 23 | 20 38 155 24 | 21 39 2053 25 | 22 40 689 26 | 23 42 266 27 | 24 43 581 28 | 25 44 2380 29 | 26 45 491 30 | 27 46 627 31 | 28 48 2388 32 | 29 50 943 33 | 30 51 2096 34 | 31 54 2530 35 | 32 56 420 36 | 33 57 1948 37 | 34 58 1869 38 | 35 59 2251 39 | 36 63 239 40 | 37 65 571 41 | 38 66 2793 42 | 39 67 978 43 | 40 68 236 44 | 41 70 181 45 | 42 71 629 46 | 43 72 2598 47 | 44 73 1744 48 | 45 74 1374 49 | 46 75 591 50 | 47 76 2679 51 | 48 77 223 52 | 49 79 47 53 | 50 81 327 54 | 51 82 2821 55 | 52 83 1451 56 | 53 84 2880 57 | 54 86 480 58 | 55 87 77 59 | 56 88 2616 60 | 57 89 246 61 | 57 89 247 62 | 58 90 2733 63 | 59 91 14 64 | 60 93 38 65 | 61 94 1936 66 | 62 96 120 67 | 63 98 1702 68 | 64 99 249 69 | 65 103 2928 70 | 66 104 2337 71 | 67 105 1023 72 | 68 108 2989 73 | 69 109 1930 74 | 70 111 2586 75 | 71 112 131 76 | 72 113 146 77 | 73 116 95 78 | 74 117 1563 79 | 75 119 1708 80 | 76 120 103 81 | 77 121 1002 82 | 78 122 2569 83 | 79 124 2833 84 | 80 125 1551 85 | 81 126 1981 86 | 82 127 29 87 | 83 128 187 88 | 84 130 747 89 | 85 131 2254 90 | 86 133 2262 91 | 87 134 1260 92 | 88 135 2243 93 | 89 136 2932 94 | 90 137 2836 95 | 91 138 2850 96 | 92 139 64 97 | 93 140 894 98 | 94 143 1919 99 | 95 144 1583 100 | 96 145 318 101 | 97 147 2046 102 | 98 148 1098 103 | 99 149 530 104 | 100 150 954 105 | -------------------------------------------------------------------------------- /datasets/prepare_ade20k_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # ------------------------------------------------------------------------------ 5 | # Copyright (c) Facebook, Inc. and its affiliates. 
6 | # To view a copy of this license, visit 7 | # https://github.com/facebookresearch/Mask2Former/blob/main/LICENSE 8 | # ------------------------------------------------------------------------------ 9 | 10 | import os 11 | from pathlib import Path 12 | 13 | import numpy as np 14 | import tqdm 15 | from PIL import Image 16 | 17 | 18 | def convert(input, output): 19 | img = np.asarray(Image.open(input)) 20 | assert img.dtype == np.uint8 21 | img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1 22 | Image.fromarray(img).save(output) 23 | 24 | 25 | if __name__ == "__main__": 26 | dataset_dir = ( 27 | Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ade" / "ADEChallengeData2016" 28 | ) 29 | for name in ["training", "validation"]: 30 | annotation_dir = dataset_dir / "annotations" / name 31 | output_dir = dataset_dir / "annotations_detectron2" / name 32 | output_dir.mkdir(parents=True, exist_ok=True) 33 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 34 | output_file = output_dir / file.name 35 | convert(file, output_file) 36 | -------------------------------------------------------------------------------- /datasets/prepare_coco_caption.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | # Convert adding COCO captions into annotation json 12 | 13 | import json 14 | import os 15 | from collections import defaultdict 16 | 17 | 18 | def load_coco_caption(): 19 | id2caption = defaultdict(list) 20 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco") 21 | for json_file in ["captions_train2017.json", "captions_val2017.json"]: 22 | with open(os.path.join(dataset_dir, "annotations", json_file)) as f: 23 | obj = json.load(f) 24 | for ann in obj["annotations"]: 25 | id2caption[int(ann["image_id"])].append(ann["caption"]) 26 | 27 | return id2caption 28 | 29 | 30 | def create_annotation_with_caption(input_json, output_json): 31 | id2coco_caption = load_coco_caption() 32 | 33 | with open(input_json) as f: 34 | obj = json.load(f) 35 | 36 | coco_count = 0 37 | 38 | print(f"Starting to add captions to {input_json} ...") 39 | print(f"Total images: {len(obj['annotations'])}") 40 | for ann in obj["annotations"]: 41 | image_id = int(ann["image_id"]) 42 | if image_id in id2coco_caption: 43 | ann["coco_captions"] = id2coco_caption[image_id] 44 | coco_count += 1 45 | print(f"Found {coco_count} captions from COCO ") 46 | 47 | print(f"Start writing to {output_json} ...") 48 | with open(output_json, "w") as f: 49 | json.dump(obj, f) 50 | 51 | 52 | if __name__ == "__main__": 53 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco") 54 | for s in ["val2017", "val2017_100", "train2017"]: 55 | create_annotation_with_caption( 56 | os.path.join(dataset_dir, "annotations/panoptic_{}.json".format(s)), 57 | os.path.join(dataset_dir, "annotations/panoptic_caption_{}.json".format(s)), 58 | ) 59 | -------------------------------------------------------------------------------- /datasets/prepare_lvis_openseg_labels.py: 
-------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | import json 12 | import os 13 | 14 | if __name__ == "__main__": 15 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco") 16 | ann = os.path.join(dataset_dir, "annotations/lvis_v1_val.json") 17 | print("Loading", ann) 18 | data = json.load(open(ann, "r")) 19 | cat_names = [x["name"] for x in sorted(data["categories"], key=lambda x: x["id"])] 20 | nonrare_names = [ 21 | x["name"] 22 | for x in sorted(data["categories"], key=lambda x: x["id"]) 23 | if x["frequency"] != "r" 24 | ] 25 | 26 | synonyms = [x["synonyms"] for x in sorted(data["categories"], key=lambda x: x["id"])] 27 | nonrare_synonyms = [ 28 | x["synonyms"] 29 | for x in sorted(data["categories"], key=lambda x: x["id"]) 30 | if x["frequency"] != "r" 31 | ] 32 | 33 | with open("datasets/openseg/lvis_1203.txt", "w") as f: 34 | for idx, cat in enumerate(cat_names): 35 | cat = cat.replace("_", " ") 36 | f.write(f"{idx+1}:{cat}\n") 37 | 38 | with open("datasets/openseg/lvis_1203_with_prompt_eng.txt", "w") as f: 39 | for idx, syns in enumerate(synonyms): 40 | cat = ",".join(syns) 41 | cat = cat.replace("_", " ") 42 | f.write(f"{idx+1}:{cat}\n") 43 | 44 | with open("datasets/openseg/lvis_nonrare_866.txt", "w") as f: 45 | for idx, cat in enumerate(nonrare_names): 46 | cat = cat.replace("_", " ") 47 | f.write(f"{idx+1}:{cat}\n") 48 | 49 | with open("datasets/openseg/lvis_nonrare_866_with_prompt_eng.txt", "w") as f: 50 | for idx, syns in enumerate(nonrare_synonyms): 51 | cat = ",".join(syns) 52 | cat = cat.replace("_", " ") 53 | f.write(f"{idx+1}:{cat}\n") 54 | -------------------------------------------------------------------------------- /datasets/prepare_pascal_ctx_full_sem_seg.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | import os 12 | import numpy as np 13 | from pathlib import Path 14 | from PIL import Image 15 | import scipy.io as sio 16 | 17 | import tqdm 18 | 19 | 20 | def generate_labels(mat_file, out_dir): 21 | 22 | mat = sio.loadmat(mat_file) 23 | label_map = mat["LabelMap"] 24 | assert label_map.dtype == np.uint16 25 | label_map[label_map == 0] = 65535 26 | label_map = label_map - 1 27 | label_map[label_map == 65534] = 65535 28 | 29 | out_file = out_dir / Path(mat_file.name).with_suffix(".tif") 30 | Image.fromarray(label_map).save(out_file) 31 | 32 | 33 | if __name__ == "__main__": 34 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "pascal_ctx_d2" 35 | voc_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "VOCdevkit/VOC2010" 36 | mat_dir = voc_dir / "trainval" 37 | for split in ["training", "validation"]: 38 | file_names = list((dataset_dir / "images" / split).glob("*.jpg")) 39 | output_img_dir = dataset_dir / "images" / split 40 | output_ann_dir = dataset_dir / "annotations_ctx459" / split 41 | 42 | output_img_dir.mkdir(parents=True, exist_ok=True) 43 | output_ann_dir.mkdir(parents=True, exist_ok=True) 44 | 45 | for file_name in tqdm.tqdm(file_names): 46 | mat_file_path = mat_dir / f"{file_name.stem}.mat" 47 | 48 | generate_labels(mat_file_path, output_ann_dir) 49 | -------------------------------------------------------------------------------- /datasets/prepare_pascal_ctx_sem_seg.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | import os 12 | from pathlib import Path 13 | import shutil 14 | 15 | import numpy as np 16 | import tqdm 17 | from PIL import Image 18 | import multiprocessing as mp 19 | import functools 20 | from detail import Detail 21 | 22 | # fmt: off 23 | _mapping = np.sort( 24 | np.array([ 25 | 0, 2, 259, 260, 415, 324, 9, 258, 144, 18, 19, 22, 23, 397, 25, 284, 26 | 158, 159, 416, 33, 162, 420, 454, 295, 296, 427, 44, 45, 46, 308, 59, 27 | 440, 445, 31, 232, 65, 354, 424, 68, 326, 72, 458, 34, 207, 80, 355, 28 | 85, 347, 220, 349, 360, 98, 187, 104, 105, 366, 189, 368, 113, 115 29 | ])) 30 | # fmt: on 31 | _key = np.array(range(len(_mapping))).astype("uint8") 32 | 33 | 34 | def generate_labels(img_info, detail_api, out_dir): 35 | def _class_to_index(mask, _mapping, _key): 36 | # assert the values 37 | values = np.unique(mask) 38 | for i in range(len(values)): 39 | assert values[i] in _mapping 40 | index = np.digitize(mask.ravel(), _mapping, right=True) 41 | return _key[index].reshape(mask.shape) 42 | 43 | sem_seg = _class_to_index(detail_api.getMask(img_info), _mapping=_mapping, _key=_key) 44 | sem_seg = sem_seg - 1 # 0 (ignore) becomes 255. 
others are shifted by 1 45 | filename = img_info["file_name"] 46 | 47 | Image.fromarray(sem_seg).save(out_dir / filename.replace("jpg", "png")) 48 | 49 | 50 | def copy_images(img_info, img_dir, out_dir): 51 | filename = img_info["file_name"] 52 | shutil.copy2(img_dir / filename, out_dir / filename) 53 | 54 | 55 | if __name__ == "__main__": 56 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "pascal_ctx_d2" 57 | voc_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "VOCdevkit/VOC2010" 58 | for split in ["training", "validation"]: 59 | img_dir = voc_dir / "JPEGImages" 60 | if split == "training": 61 | detail_api = Detail(voc_dir / "trainval_merged.json", img_dir, "train") 62 | else: 63 | detail_api = Detail(voc_dir / "trainval_merged.json", img_dir, "val") 64 | img_infos = detail_api.getImgs() 65 | 66 | output_img_dir = dataset_dir / "images" / split 67 | output_ann_dir = dataset_dir / "annotations_ctx59" / split 68 | 69 | output_img_dir.mkdir(parents=True, exist_ok=True) 70 | output_ann_dir.mkdir(parents=True, exist_ok=True) 71 | 72 | pool = mp.Pool(processes=max(mp.cpu_count() // 2, 4)) 73 | 74 | pool.map( 75 | functools.partial(copy_images, img_dir=img_dir, out_dir=output_img_dir), 76 | tqdm.tqdm(img_infos, desc=f"Writing {split} images to {output_img_dir} ..."), 77 | chunksize=100, 78 | ) 79 | 80 | pool.map( 81 | functools.partial(generate_labels, detail_api=detail_api, out_dir=output_ann_dir), 82 | tqdm.tqdm(img_infos, desc=f"Writing {split} images to {output_ann_dir} ..."), 83 | chunksize=100, 84 | ) 85 | -------------------------------------------------------------------------------- /datasets/prepare_pascal_voc_sem_seg.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | import os 12 | from pathlib import Path 13 | import shutil 14 | 15 | import numpy as np 16 | import tqdm 17 | from PIL import Image 18 | 19 | 20 | def convert(input, output): 21 | img = np.asarray(Image.open(input)) 22 | assert img.dtype == np.uint8 23 | # do nothing 24 | Image.fromarray(img).save(output) 25 | 26 | 27 | if __name__ == "__main__": 28 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "pascal_voc_d2" 29 | voc_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "VOCdevkit/VOC2012" 30 | for split in ["training", "validation"]: 31 | if split == "training": 32 | img_name_path = voc_dir / "ImageSets/Segmentation/train.txt" 33 | else: 34 | img_name_path = voc_dir / "ImageSets/Segmentation/val.txt" 35 | img_dir = voc_dir / "JPEGImages" 36 | ann_dir = voc_dir / "SegmentationClass" 37 | 38 | output_img_dir = dataset_dir / "images" / split 39 | output_ann_dir = dataset_dir / "annotations_pascal21" / split 40 | 41 | output_img_dir.mkdir(parents=True, exist_ok=True) 42 | output_ann_dir.mkdir(parents=True, exist_ok=True) 43 | 44 | with open(img_name_path) as f: 45 | for line in tqdm.tqdm(f.readlines()): 46 | img_name = line.strip() 47 | img_path = img_dir / f"{img_name}.jpg" 48 | ann_path = ann_dir / f"{img_name}.png" 49 | 50 | # print(f'copy2 {output_img_dir}') 51 | shutil.copy2(img_path, output_img_dir) 52 | # print(f"convert {ann_dir} to {output_ann_dir / f'{img_name}.png'}") 53 | convert(ann_path, output_ann_dir / f"{img_name}.png") 54 | -------------------------------------------------------------------------------- /demo/examples/ade.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/demo/examples/ade.jpg -------------------------------------------------------------------------------- /demo/examples/coco.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/demo/examples/coco.jpg -------------------------------------------------------------------------------- /demo/examples/ego4d.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/demo/examples/ego4d.jpg -------------------------------------------------------------------------------- /demo/examples/purse.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/demo/examples/purse.jpeg -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | FROM pytorch/pytorch:1.13.1-cuda11.6-cudnn8-devel 12 | 13 | WORKDIR /workspace 14 | 15 | ARG DEBIAN_FRONTEND=noninteractive 16 | ENV TZ=US/Pacific 17 | 18 | RUN apt-get update && apt-get install -y \ 19 | build-essential \ 20 | cmake \ 21 | curl \ 22 | g++ \ 23 | wget \ 24 | bzip2 \ 25 | git \ 26 | vim \ 27 | tmux \ 28 | htop \ 29 | git \ 30 | zip \ 31 | unzip \ 32 | ca-certificates \ 33 | libosmesa6-dev \ 34 | libgl1-mesa-glx \ 35 | libglfw3 \ 36 | patchelf \ 37 | libglu1-mesa \ 38 | libxext6 \ 39 | libxtst6 \ 40 | libxrender1 \ 41 | libxi6 \ 42 | libjpeg-dev \ 43 | libpng-dev \ 44 | libopenblas-dev \ 45 | libopencv-dev \ 46 | libyaml-dev \ 47 | libavformat-dev \ 48 | libavcodec-dev \ 49 | libswscale-dev \ 50 | libavutil-dev \ 51 | libavfilter-dev \ 52 | libavdevice-dev \ 53 | libswresample-dev \ 54 | less \ 55 | groff \ 56 | mpich 57 | 58 | RUN apt-get clean && rm -rf /var/lib/apt/lists/* 59 | 60 | # Install git lfs 61 | RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash 62 | RUN apt-get install -y git-lfs 63 | RUN git lfs install 64 | 65 | 66 | RUN curl https://rclone.org/install.sh | bash 67 | 68 | # Set timezone 69 | RUN ln -sf /usr/share/zoneinfo/US/Pacific /etc/localtime 70 | 71 | # Set CUDA_ROOT 72 | RUN export CUDA_HOME="/usr/local/cuda" 73 | 74 | # Install pytorch 75 | #RUN conda install pytorch torchvision cudatoolkit=11.1 -c pytorch -c conda-forge -y 76 | 77 | # Install zsh 78 | RUN sh -c "$(wget -O- https://github.com/deluan/zsh-in-docker/releases/download/v1.1.1/zsh-in-docker.sh)" -t robbyrussell -p git 79 | 80 | # Set a fixed model cache directory. 
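# (FVCORE_CACHE is the cache directory that fvcore/detectron2 use for downloaded checkpoints
#  and files; /tmp is presumably chosen here to keep those downloads out of the container's $HOME.)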
81 | ENV FVCORE_CACHE="/tmp" 82 | 83 | ENV HOME /workspace 84 | -------------------------------------------------------------------------------- /figs/github_arch.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/figs/github_arch.gif -------------------------------------------------------------------------------- /figs/github_vis_ade_0.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/figs/github_vis_ade_0.gif -------------------------------------------------------------------------------- /figs/github_vis_ade_1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/figs/github_vis_ade_1.gif -------------------------------------------------------------------------------- /figs/github_vis_coco_0.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/figs/github_vis_coco_0.gif -------------------------------------------------------------------------------- /figs/github_vis_coco_1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/figs/github_vis_coco_1.gif -------------------------------------------------------------------------------- /figs/github_vis_ego4d_0.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/figs/github_vis_ego4d_0.gif -------------------------------------------------------------------------------- /figs/github_vis_ego4d_1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/figs/github_vis_ego4d_1.gif -------------------------------------------------------------------------------- /figs/teaser.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/figs/teaser.jpg -------------------------------------------------------------------------------- /odise/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | # This line will be programatically read/write by setup.py. 12 | # Leave them at the bottom of this file and don't touch them. 
13 | __version__ = "0.1" 14 | -------------------------------------------------------------------------------- /odise/checkpoint/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .odise_checkpointer import ODISECheckpointer 12 | 13 | __all__ = ["ODISECheckpointer"] 14 | -------------------------------------------------------------------------------- /odise/config/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .instantiate import instantiate_odise 12 | from .utils import auto_scale_workers 13 | 14 | __all__ = [ 15 | "instantiate_odise", 16 | "auto_scale_workers", 17 | ] 18 | -------------------------------------------------------------------------------- /odise/config/instantiate.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from detectron2.config import instantiate 12 | 13 | 14 | def instantiate_odise(cfg): 15 | backbone = instantiate(cfg.backbone) 16 | cfg.sem_seg_head.input_shape = backbone.output_shape() 17 | cfg.sem_seg_head.pixel_decoder.input_shape = backbone.output_shape() 18 | cfg.backbone = backbone 19 | model = instantiate(cfg) 20 | 21 | return model 22 | -------------------------------------------------------------------------------- /odise/config/utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # To view a copy of this license, visit 4 | # https://github.com/facebookresearch/detectron2/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 
11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | from copy import deepcopy 18 | 19 | 20 | def auto_scale_workers(cfg, num_workers: int): 21 | """ 22 | When the config is defined for certain number of workers (according to 23 | ``cfg.train.reference_world_size``) that's different from the number of 24 | workers currently in use, returns a new cfg where the total batch size 25 | is scaled so that the per-GPU batch size stays the same as the 26 | original ``total_batch_size // reference_world_size``. 27 | 28 | Other config options are also scaled accordingly: 29 | * training steps and warmup steps are scaled inverse proportionally. 30 | * learning rate are scaled proportionally, following :paper:`ImageNet in 1h`. 31 | 32 | For example, with the original config like the following: 33 | 34 | .. code-block:: yaml 35 | 36 | dataloader.train.total_batch_size: 16 37 | optimizer.lr: 0.1 38 | train.reference_world_size: 8 39 | train.max_iter: 5000 40 | train.checkpointer.period: 1000 41 | 42 | When this config is used on 16 GPUs instead of the reference number 8, 43 | calling this method will return a new config with: 44 | 45 | .. code-block:: yaml 46 | 47 | dataloader.train.total_batch_size: 32 48 | optimizer.lr: 0.2 49 | train.reference_world_size: 16 50 | train.max_iter: 2500 51 | train.checkpointer.period: 500 52 | 53 | Note that both the original config and this new config can be trained on 16 GPUs. 54 | It's up to user whether to enable this feature (by setting ``reference_world_size``). 55 | 56 | Returns: 57 | CfgNode: a new config. Same as original if ``cfg.SOLVER.REFERENCE_WORLD_SIZE==0``. 58 | """ 59 | old_world_size = cfg.train.reference_world_size 60 | if old_world_size == 0 or old_world_size == num_workers: 61 | print("No need to scale the config.") 62 | return cfg 63 | cfg = deepcopy(cfg) 64 | 65 | assert cfg.dataloader.train.total_batch_size % old_world_size == 0, ( 66 | f"Invalid reference_world_size in config! " 67 | f"{cfg.dataloader.train.total_batch_size} % {old_world_size} != 0" 68 | ) 69 | scale = num_workers / old_world_size 70 | bs = cfg.dataloader.train.total_batch_size = int( 71 | round(cfg.dataloader.train.total_batch_size * scale) 72 | ) 73 | lr = cfg.optimizer.lr = cfg.optimizer.lr * scale 74 | max_iter = cfg.train.max_iter = int(round(cfg.train.max_iter / scale)) 75 | cfg.train.eval_period = int(round(cfg.train.eval_period / scale)) 76 | cfg.train.checkpointer.period = int(round(cfg.train.checkpointer.period / scale)) 77 | cfg.train.reference_world_size = num_workers # maintain invariant 78 | print( 79 | f"Auto-scaling the config to batch_size={bs}, learning_rate={lr}, " f"max_iter={max_iter}." 80 | ) 81 | 82 | return cfg 83 | -------------------------------------------------------------------------------- /odise/data/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | 12 | from .build import get_openseg_labels, build_d2_train_dataloader, build_d2_test_dataloader 13 | from .dataset_mapper import COCOPanopticDatasetMapper 14 | from .datasets import ( 15 | register_all_ctx59, 16 | register_all_pascal21, 17 | register_all_ctx459, 18 | register_all_coco_panoptic_annos_sem_seg_caption, 19 | ) 20 | 21 | __all__ = [ 22 | "COCOPanopticDatasetMapper", 23 | "get_openseg_labels", 24 | "build_d2_train_dataloader", 25 | "build_d2_test_dataloader", 26 | "register_all_ctx59", 27 | "register_all_pascal21", 28 | "register_all_ctx459", 29 | "register_all_coco_panoptic_annos_sem_seg_caption", 30 | ] 31 | -------------------------------------------------------------------------------- /odise/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .register_pascal import register_all_ctx59, register_all_pascal21, register_all_ctx459 12 | from .register_coco_caption import register_all_coco_panoptic_annos_sem_seg_caption 13 | 14 | __all__ = [ 15 | "register_all_ctx59", 16 | "register_all_pascal21", 17 | "register_all_ctx459", 18 | "register_all_coco_panoptic_annos_sem_seg_caption", 19 | ] 20 | -------------------------------------------------------------------------------- /odise/data/datasets/openseg_labels/README.md: -------------------------------------------------------------------------------- 1 | # Acknowledgement 2 | 3 | We thank Golnaz Ghiasi for providing the [OpenSeg](https://arxiv.org/abs/2112.12143) labels for evaluation. 
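## Label file format

Each `*.txt` file in this directory lists one category per line as `id:name`; the
`*_with_prompt_eng.txt` variants append comma-separated synonyms after the canonical name, and
ids that are unused in a dataset are kept as `invalid_class_id` placeholders so the numbering
stays contiguous. A minimal parsing sketch (an illustrative helper, not the repo's
`get_openseg_labels` implementation) could look like:

```python
from pathlib import Path


def read_openseg_labels(path):
    """Parse an openseg label file into (category_id, [name, synonym, ...]) pairs."""
    labels = []
    for line in Path(path).read_text().splitlines():
        if not line.strip():
            continue
        idx, names = line.split(":", 1)
        labels.append((int(idx), [n.strip() for n in names.split(",")]))
    return labels


# Example: keep only real categories, dropping the invalid_class_id placeholders.
coco_labels = [
    (i, names)
    for i, names in read_openseg_labels("coco_panoptic_with_prompt_eng.txt")
    if names != ["invalid_class_id"]
]
```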
4 | 5 | 6 | ## Citation 7 | 8 | ```BiBTeX 9 | @inproceedings{ghiasi2022scaling, 10 | title={Scaling open-vocabulary image segmentation with image-level labels}, 11 | author={Ghiasi, Golnaz and Gu, Xiuye and Cui, Yin and Lin, Tsung-Yi}, 12 | booktitle={Computer Vision--ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23--27, 2022, Proceedings, Part XXXVI}, 13 | pages={540--557}, 14 | year={2022}, 15 | organization={Springer} 16 | } 17 | ``` 18 | -------------------------------------------------------------------------------- /odise/data/datasets/openseg_labels/coco_panoptic.txt: -------------------------------------------------------------------------------- 1 | 0:invalid_class_id 2 | 1:person 3 | 2:bicycle 4 | 3:car 5 | 4:motorcycle 6 | 5:airplane 7 | 6:bus 8 | 7:train 9 | 8:truck 10 | 9:boat 11 | 10:traffic light 12 | 11:fire hydrant 13 | 12:invalid_class_id 14 | 13:stop sign 15 | 14:parking meter 16 | 15:bench 17 | 16:bird 18 | 17:cat 19 | 18:dog 20 | 19:horse 21 | 20:sheep 22 | 21:cow 23 | 22:elephant 24 | 23:bear 25 | 24:zebra 26 | 25:giraffe 27 | 26:invalid_class_id 28 | 27:backpack 29 | 28:umbrella 30 | 29:invalid_class_id 31 | 30:invalid_class_id 32 | 31:handbag 33 | 32:tie 34 | 33:suitcase 35 | 34:frisbee 36 | 35:skis 37 | 36:snowboard 38 | 37:sports ball 39 | 38:kite 40 | 39:baseball bat 41 | 40:baseball glove 42 | 41:skateboard 43 | 42:surfboard 44 | 43:tennis racket 45 | 44:bottle 46 | 45:invalid_class_id 47 | 46:wine glass 48 | 47:cup 49 | 48:fork 50 | 49:knife 51 | 50:spoon 52 | 51:bowl 53 | 52:banana 54 | 53:apple 55 | 54:sandwich 56 | 55:orange 57 | 56:broccoli 58 | 57:carrot 59 | 58:hot dog 60 | 59:pizza 61 | 60:donut 62 | 61:cake 63 | 62:chair 64 | 63:couch 65 | 64:potted plant 66 | 65:bed 67 | 66:invalid_class_id 68 | 67:dining table 69 | 68:invalid_class_id 70 | 69:invalid_class_id 71 | 70:toilet 72 | 71:invalid_class_id 73 | 72:tv 74 | 73:laptop 75 | 74:mouse 76 | 75:remote 77 | 76:keyboard 78 | 77:cell phone 79 | 78:microwave 80 | 79:oven 81 | 80:toaster 82 | 81:sink 83 | 82:refrigerator 84 | 83:invalid_class_id 85 | 84:book 86 | 85:clock 87 | 86:vase 88 | 87:scissors 89 | 88:teddy bear 90 | 89:hair drier 91 | 90:toothbrush 92 | 91:invalid_class_id 93 | 92:banner 94 | 93:blanket 95 | 94:invalid_class_id 96 | 95:bridge 97 | 96:invalid_class_id 98 | 97:invalid_class_id 99 | 98:invalid_class_id 100 | 99:invalid_class_id 101 | 100:cardboard 102 | 101:invalid_class_id 103 | 102:invalid_class_id 104 | 103:invalid_class_id 105 | 104:invalid_class_id 106 | 105:invalid_class_id 107 | 106:invalid_class_id 108 | 107:counter 109 | 108:invalid_class_id 110 | 109:curtain 111 | 110:invalid_class_id 112 | 111:invalid_class_id 113 | 112:door 114 | 113:invalid_class_id 115 | 114:invalid_class_id 116 | 115:invalid_class_id 117 | 116:invalid_class_id 118 | 117:invalid_class_id 119 | 118:wood floor 120 | 119:flower 121 | 120:invalid_class_id 122 | 121:invalid_class_id 123 | 122:fruit 124 | 123:invalid_class_id 125 | 124:invalid_class_id 126 | 125:gravel 127 | 126:invalid_class_id 128 | 127:invalid_class_id 129 | 128:house 130 | 129:invalid_class_id 131 | 130:light 132 | 131:invalid_class_id 133 | 132:invalid_class_id 134 | 133:mirror-stuff 135 | 134:invalid_class_id 136 | 135:invalid_class_id 137 | 136:invalid_class_id 138 | 137:invalid_class_id 139 | 138:net 140 | 139:invalid_class_id 141 | 140:invalid_class_id 142 | 141:pillow 143 | 142:invalid_class_id 144 | 143:invalid_class_id 145 | 144:platform 146 | 145:playingfield 147 | 146:invalid_class_id 148 | 147:railroad 149 | 
148:river 150 | 149:road 151 | 150:invalid_class_id 152 | 151:roof 153 | 152:invalid_class_id 154 | 153:invalid_class_id 155 | 154:sand 156 | 155:sea 157 | 156:shelf 158 | 157:invalid_class_id 159 | 158:invalid_class_id 160 | 159:snow 161 | 160:invalid_class_id 162 | 161:stairs 163 | 162:invalid_class_id 164 | 163:invalid_class_id 165 | 164:invalid_class_id 166 | 165:invalid_class_id 167 | 166:tent 168 | 167:invalid_class_id 169 | 168:towel 170 | 169:invalid_class_id 171 | 170:invalid_class_id 172 | 171:brick wall 173 | 172:invalid_class_id 174 | 173:invalid_class_id 175 | 174:invalid_class_id 176 | 175:stone wall 177 | 176:tile wall 178 | 177:wood wall 179 | 178:water 180 | 179:invalid_class_id 181 | 180:window blind 182 | 181:window 183 | 182:invalid_class_id 184 | 183:invalid_class_id 185 | 184:tree 186 | 185:fence 187 | 186:ceiling 188 | 187:sky 189 | 188:cabinet 190 | 189:table 191 | 190:floor 192 | 191:pavement 193 | 192:mountain 194 | 193:grass 195 | 194:dirt 196 | 195:paper 197 | 196:food 198 | 197:building 199 | 198:rock 200 | 199:wall 201 | 200:rug 202 | -------------------------------------------------------------------------------- /odise/data/datasets/openseg_labels/pascal_context_59.txt: -------------------------------------------------------------------------------- 1 | 0:invalid_class_id 2 | 1:aeroplane 3 | 2:bag 4 | 3:bed 5 | 4:bedclothes 6 | 5:bench 7 | 6:bicycle 8 | 7:bird 9 | 8:boat 10 | 9:book 11 | 10:bottle 12 | 11:building 13 | 12:bus 14 | 13:cabinet 15 | 14:car 16 | 15:cat 17 | 16:ceiling 18 | 17:chair 19 | 18:cloth 20 | 19:computer 21 | 20:cow 22 | 21:cup 23 | 22:curtain 24 | 23:dog 25 | 24:door 26 | 25:fence 27 | 26:floor 28 | 27:flower 29 | 28:food 30 | 29:grass 31 | 30:ground 32 | 31:horse 33 | 32:keyboard 34 | 33:light 35 | 34:motorbike 36 | 35:mountain 37 | 36:mouse 38 | 37:person 39 | 38:plate 40 | 39:platform 41 | 40:pottedplant 42 | 41:road 43 | 42:rock 44 | 43:sheep 45 | 44:shelves 46 | 45:sidewalk 47 | 46:sign 48 | 47:sky 49 | 48:snow 50 | 49:sofa 51 | 50:diningtable 52 | 51:track 53 | 52:train 54 | 53:tree 55 | 54:truck 56 | 55:tvmonitor 57 | 56:wall 58 | 57:water 59 | 58:window 60 | 59:wood 61 | -------------------------------------------------------------------------------- /odise/data/datasets/openseg_labels/pascal_context_59_with_prompt_eng.txt: -------------------------------------------------------------------------------- 1 | 0:invalid_class_id 2 | 1:aeroplane,aeroplanes,airplanes,airplane 3 | 2:bag,bags 4 | 3:bed,beds 5 | 4:bedclothes 6 | 5:bench,benches 7 | 6:bicycle,bicycles 8 | 7:bird,birds 9 | 8:boat,boats 10 | 9:book,books 11 | 10:bottle,bottles,water bottle 12 | 11:building,buildings 13 | 12:bus,buses 14 | 13:cabinet,cabinets,drawer,drawers 15 | 14:car,cars 16 | 15:cat,cats,kitties,kitty 17 | 16:ceiling 18 | 17:chair,chairs 19 | 18:cloth,clothes 20 | 19:computer case 21 | 20:cow,cows 22 | 21:cup,cups 23 | 22:curtain,curtains 24 | 23:dog,dogs,puppy,puppies 25 | 24:door,doors 26 | 25:fence,fences 27 | 26:floor,tile ground,carpet,rug,flooring 28 | 27:flower,flowers 29 | 28:food 30 | 29:grass,grasses,lawn,turf 31 | 30:ground,soil,soil ground,dirt ground 32 | 31:horse,horses,foal 33 | 32:keyboard,keyboards 34 | 33:lamp,lamps,bulb,bulbs 35 | 34:motorbike,motorcycle,motorbikes,motorcycles 36 | 35:mountain,mountains 37 | 36:mouse 38 | 37:person,child,girl,boy,woman,man,people,childeren,girls,boys,women,men,lady,guy,ladies,guys 39 | 38:plate,plates 40 | 39:platform,platforms 41 | 40:pottedplant,pottedplants,plant pot,plant pots,planter,planters 42 | 
41:street,streets 43 | 42:rock,rocks,stone,stones 44 | 43:sheep 45 | 44:shelves,shelf 46 | 45:sidewalk 47 | 46:sign,signs 48 | 47:sky,clouds 49 | 48:snow 50 | 49:sofa 51 | 50:diningtable,diningtables,table,tables,desk,desks,side table,side tables,coffee table 52 | 51:track,train track,railroad 53 | 52:train,trains,locomotive,locomotives,freight train 54 | 53:tree,trees 55 | 54:truck,trucks 56 | 55:tvmonitor,monitor,tv 57 | 56:wall,walls 58 | 57:water 59 | 58:window,windows 60 | 59:wood piece 61 | -------------------------------------------------------------------------------- /odise/data/datasets/openseg_labels/pascal_voc_21.txt: -------------------------------------------------------------------------------- 1 | 0:background,bag,bed,bench,book,building,cabinet,ceiling,cloth,computer,cup,door,fence,floor,flower,food,grass,ground,keyboard,light,mountain,mouse,curtain,platform,sign,plate,road,rock,shelves,sidewalk,sky,snow,bedclothes,track,tree,truck,wall,water,window,wood 2 | 1:aeroplane 3 | 2:bicycle 4 | 3:bird 5 | 4:boat 6 | 5:bottle 7 | 6:bus 8 | 7:car 9 | 8:cat 10 | 9:chair 11 | 10:cow 12 | 11:diningtable 13 | 12:dog 14 | 13:horse 15 | 14:motorbike 16 | 15:person 17 | 16:pottedplant 18 | 17:sheep 19 | 18:sofa 20 | 19:train 21 | 20:tvmonitor 22 | -------------------------------------------------------------------------------- /odise/data/datasets/openseg_labels/pascal_voc_21_with_prompt_eng.txt: -------------------------------------------------------------------------------- 1 | 0:background,crops,bush,shrub,tiles,pavement,rug,carpet,box,boxes,speaker,storage,painting,board,panel,poster,clock,cage,drinking glass,park,plaything,toy,fireplace,bag,bag,bed,bench,book,books,building,buildings,cabinet,drawer,ceiling,computer,computer case,cup,cups,door,fence,floor,flower,grass,lawn,turf,ground,soil,dirt,tiles,keyboard,lamp,mountain,hills,mouse,curtain,platform,sign,street,rock,stone,shelf,sidewalk,sky,clouds,snow,track,train track,tree,trees,wall,water,window,wood,woods 2 | 1:aeroplane,airplane,aeroplanes,airplanes 3 | 2:bicycle,bicycles,bike,bikes 4 | 3:bird,birds 5 | 4:boat,boats 6 | 5:bottle,bottles,water bottle 7 | 6:bus,buses 8 | 7:car,cars 9 | 8:cat,cats,kitties,kitty 10 | 9:chair,chairs 11 | 10:cow,cows,calf 12 | 11:diningtable,dining table,diningtables,dining tables,plate,plates 13 | 12:dog,dogs,puppy,puppies 14 | 13:horse,horses,foal 15 | 14:motorbike,motorcycle,motorbikes,motorcycles 16 | 15:person,child,girl,boy,woman,man,people,childeren,girls,boys,women,men,lady,guy,ladies,guys,clothes 17 | 16:pottedplant,pottedplants,plant pot,plant pots,planter,planters 18 | 17:sheep 19 | 18:sofa,sofas 20 | 19:train,trains,locomotive,locomotives,freight train 21 | 20:tvmonitor,monitor,tv 22 | -------------------------------------------------------------------------------- /odise/data/datasets/register_coco_caption.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | import os 12 | from detectron2.data import MetadataCatalog 13 | from mask2former.data.datasets.register_coco_panoptic_annos_semseg import ( 14 | get_metadata, 15 | register_coco_panoptic_annos_sem_seg, 16 | ) 17 | 18 | _PREDEFINED_SPLITS_COCO_PANOPTIC_CAPTION = { 19 | "coco_2017_train_panoptic_caption": ( 20 | # This is the original panoptic annotation directory 21 | "coco/panoptic_train2017", 22 | "coco/annotations/panoptic_caption_train2017.json", 23 | # This directory contains semantic annotations that are 24 | # converted from panoptic annotations. 25 | # It is used by PanopticFPN. 26 | # You can use the script at detectron2/datasets/prepare_panoptic_fpn.py 27 | # to create these directories. 28 | "coco/panoptic_semseg_train2017", 29 | ), 30 | "coco_2017_val_panoptic_caption": ( 31 | "coco/panoptic_val2017", 32 | "coco/annotations/panoptic_caption_val2017.json", 33 | "coco/panoptic_semseg_val2017", 34 | ), 35 | "coco_2017_val_100_panoptic_caption": ( 36 | "coco/panoptic_val2017_100", 37 | "coco/annotations/panoptic_caption_val2017_100.json", 38 | "coco/panoptic_semseg_val2017_100", 39 | ), 40 | } 41 | 42 | 43 | # NOTE: the name is "coco_2017_train_panoptic_caption_with_sem_seg" and "coco_2017_val_panoptic_caption_with_sem_seg" # noqa 44 | def register_all_coco_panoptic_annos_sem_seg_caption(root): 45 | for ( 46 | prefix, 47 | (panoptic_root, panoptic_json, semantic_root), 48 | ) in _PREDEFINED_SPLITS_COCO_PANOPTIC_CAPTION.items(): 49 | if prefix.endswith("_panoptic_caption"): 50 | prefix_instances = prefix[: -len("_panoptic_caption")] 51 | else: 52 | raise ValueError("Unknown prefix: {}".format(prefix)) 53 | instances_meta = MetadataCatalog.get(prefix_instances) 54 | image_root, instances_json = instances_meta.image_root, instances_meta.json_file 55 | 56 | register_coco_panoptic_annos_sem_seg( 57 | prefix, 58 | get_metadata(), 59 | image_root, 60 | os.path.join(root, panoptic_root), 61 | os.path.join(root, panoptic_json), 62 | os.path.join(root, semantic_root), 63 | instances_json, 64 | ) 65 | 66 | 67 | register_all_coco_panoptic_annos_sem_seg_caption(os.getenv("DETECTRON2_DATASETS", "datasets")) 68 | -------------------------------------------------------------------------------- /odise/engine/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .train_loop import SimpleTrainer, AMPTrainer 12 | 13 | __all__ = ["SimpleTrainer", "AMPTrainer"] 14 | -------------------------------------------------------------------------------- /odise/engine/hooks.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
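Because `register_coco_caption.py` above calls `register_all_coco_panoptic_annos_sem_seg_caption` at import time (rooted at `$DETECTRON2_DATASETS`, default `datasets/`), the caption-augmented splits become queryable through detectron2's catalogs under the `*_with_sem_seg` names mentioned in its NOTE. A small sketch of looking one up — the printed metadata fields are illustrative, and loading the dicts requires the COCO panoptic/caption files to actually exist on disk:

```python
from detectron2.data import DatasetCatalog, MetadataCatalog

import odise.data  # noqa: F401  -- importing triggers the dataset registration

name = "coco_2017_val_panoptic_caption_with_sem_seg"
meta = MetadataCatalog.get(name)
print(meta.panoptic_root, meta.panoptic_json)  # assumed fields set by the register helper

dataset_dicts = DatasetCatalog.get(name)  # needs the files under $DETECTRON2_DATASETS
print(len(dataset_dicts), sorted(dataset_dicts[0].keys()))
```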
3 | # To view a copy of this license, visit 4 | # https://github.com/facebookresearch/detectron2/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | import inspect 18 | import detectron2.utils.comm as comm 19 | from detectron2.engine import EvalHook as _EvalHook 20 | from detectron2.evaluation.testing import flatten_results_dict 21 | 22 | 23 | class EvalHook(_EvalHook): 24 | def __init__(self, eval_period, eval_function): 25 | super().__init__(eval_period, eval_function) 26 | func_args = inspect.getfullargspec(eval_function).args 27 | assert {"final_iter", "next_iter"}.issubset(set(func_args)), ( 28 | "Eval function must have both 'final_iter' and 'next_iter' as arguments. " 29 | f"Got {func_args} instead." 30 | ) 31 | 32 | def _do_eval(self, final_iter=False, next_iter=0): 33 | results = self._func(final_iter=final_iter, next_iter=next_iter) 34 | 35 | if results: 36 | assert isinstance( 37 | results, dict 38 | ), "Eval function must return a dict. Got {} instead.".format(results) 39 | 40 | flattened_results = flatten_results_dict(results) 41 | for k, v in flattened_results.items(): 42 | try: 43 | v = float(v) 44 | except Exception as e: 45 | raise ValueError( 46 | "[EvalHook] eval_function should return a nested dict of float. " 47 | "Got '{}: {}' instead.".format(k, v) 48 | ) from e 49 | self.trainer.storage.put_scalars(**flattened_results, smoothing_hint=False) 50 | 51 | # Evaluation may take different amounts of time among workers. 52 | # A barrier makes them start the next iteration together. 53 | comm.synchronize() 54 | 55 | def after_step(self): 56 | next_iter = self.trainer.iter + 1 57 | if self._period > 0 and next_iter % self._period == 0: 58 | # do the last eval in after_train 59 | if next_iter != self.trainer.max_iter: 60 | self._do_eval(next_iter=next_iter) 61 | 62 | def after_train(self): 63 | # This condition is to prevent the eval from running after a failed training 64 | if self.trainer.iter + 1 >= self.trainer.max_iter: 65 | self._do_eval(final_iter=True) 66 | # func is likely a closure that holds a reference to the trainer, 67 | # so we delete it to avoid a circular reference in the end 68 | del self._func 69 | -------------------------------------------------------------------------------- /odise/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License.
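The `EvalHook` subclass above requires its `eval_function` to accept both `final_iter` and `next_iter` keyword arguments (that is what the `issubset` assertion enforces), and the returned, possibly nested, dict must contain float-convertible values. A minimal compatible callback, with a placeholder evaluation body:

```python
from odise.engine.hooks import EvalHook


def eval_function(final_iter: bool = False, next_iter: int = 0):
    # A real callback would run inference here (e.g. via inference_on_dataset)
    # and return its metrics; the values below are placeholders.
    tag = "final" if final_iter else f"iter_{next_iter}"
    print(f"running evaluation ({tag})")
    return {"panoptic_seg": {"PQ": 0.0, "SQ": 0.0, "RQ": 0.0}}


hook = EvalHook(eval_period=5000, eval_function=eval_function)
# The hook is then passed to the trainer alongside the other hooks.
```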
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .evaluator import inference_on_dataset 12 | from .d2_evaluator import ( 13 | COCOPanopticEvaluator, 14 | InstanceSegEvaluator, 15 | SemSegEvaluator, 16 | COCOEvaluator, 17 | ) 18 | 19 | __all__ = [ 20 | "inference_on_dataset", 21 | "COCOPanopticEvaluator", 22 | "InstanceSegEvaluator", 23 | "SemSegEvaluator", 24 | "COCOEvaluator", 25 | ] 26 | -------------------------------------------------------------------------------- /odise/model_zoo/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # To view a copy of this license, visit 4 | # https://github.com/facebookresearch/detectron2/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | """ 18 | Model Zoo API for ODISE: a collection of functions to create common model architectures 19 | listed in `MODEL_ZOO.md `_, 20 | and optionally load their pre-trained weights. 21 | """ 22 | 23 | from .model_zoo import get, get_config_file, get_checkpoint_url, get_config 24 | 25 | __all__ = ["get_checkpoint_url", "get", "get_config_file", "get_config"] 26 | -------------------------------------------------------------------------------- /odise/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .backbone import FeatureExtractorBackbone 12 | 13 | __all__ = ["FeatureExtractorBackbone"] 14 | -------------------------------------------------------------------------------- /odise/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
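The model-zoo package above exposes `get_config_file`, `get_config`, `get_checkpoint_url` and `get`. A hedged usage sketch — the config key and the `trained=True` keyword follow the detectron2 `model_zoo` convention and are assumptions here, so check `odise/model_zoo/model_zoo.py` for the exact keys and signatures:

```python
import odise.model_zoo as model_zoo

config_path = "Panoptic/odise_label_coco_50e.py"  # illustrative config key

config_file = model_zoo.get_config_file(config_path)  # path to the config on disk
cfg = model_zoo.get_config(config_path)               # loaded (lazy) config object
url = model_zoo.get_checkpoint_url(config_path)       # pretrained weights URL
model = model_zoo.get(config_path, trained=True)      # assumed keyword, see note above
print(config_file, url, type(model).__name__)
```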
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .feature_extractor import FeatureExtractorBackbone 12 | 13 | __all__ = ["FeatureExtractorBackbone"] 14 | -------------------------------------------------------------------------------- /odise/modeling/diffusion/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .diffusion_builder import create_gaussian_diffusion 12 | from .gaussian_diffusion import GaussianDiffusion 13 | 14 | __all__ = ["create_gaussian_diffusion", "GaussianDiffusion"] 15 | -------------------------------------------------------------------------------- /odise/modeling/diffusion/diffusion_builder.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2021 OpenAI 3 | # To view a copy of this license, visit 4 | # https://github.com/openai/glide-text2im/blob/main/LICENSE 5 | # ------------------------------------------------------------------------------ 6 | # 7 | # ------------------------------------------------------------------------------ 8 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 9 | # 10 | # This work is made available under the Nvidia Source Code License. 11 | # To view a copy of this license, visit 12 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 13 | # 14 | # Written by Jiarui Xu 15 | # ------------------------------------------------------------------------------ 16 | 17 | from . 
import gaussian_diffusion as gd 18 | from .respace import SpacedDiffusion, space_timesteps 19 | 20 | 21 | def create_gaussian_diffusion( 22 | *, 23 | steps=1000, 24 | learn_sigma=False, 25 | sigma_small=False, 26 | noise_schedule="linear", 27 | use_kl=False, 28 | predict_xstart=False, 29 | rescale_timesteps=False, 30 | rescale_learned_sigmas=False, 31 | timestep_respacing="", 32 | ): 33 | betas = gd.get_named_beta_schedule(noise_schedule, steps) 34 | if use_kl: 35 | loss_type = gd.LossType.RESCALED_KL 36 | elif rescale_learned_sigmas: 37 | loss_type = gd.LossType.RESCALED_MSE 38 | else: 39 | loss_type = gd.LossType.MSE 40 | if not timestep_respacing: 41 | timestep_respacing = [steps] 42 | return SpacedDiffusion( 43 | use_timesteps=space_timesteps(steps, timestep_respacing), 44 | betas=betas, 45 | model_mean_type=( 46 | gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X 47 | ), 48 | model_var_type=( 49 | (gd.ModelVarType.FIXED_LARGE if not sigma_small else gd.ModelVarType.FIXED_SMALL) 50 | if not learn_sigma 51 | else gd.ModelVarType.LEARNED_RANGE 52 | ), 53 | loss_type=loss_type, 54 | rescale_timesteps=rescale_timesteps, 55 | ) 56 | -------------------------------------------------------------------------------- /odise/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | from .odise import CategoryODISE, CaptionODISE 11 | 12 | __all__ = [ 13 | "CategoryODISE", 14 | "CaptionODISE", 15 | ] 16 | -------------------------------------------------------------------------------- /odise/modeling/preprocess.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 
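`create_gaussian_diffusion` above assembles a `SpacedDiffusion` from the GLIDE-style options. A short sketch of calling it with the defaults spelled out (the chosen values are merely illustrative):

```python
from odise.modeling.diffusion import create_gaussian_diffusion

# A full 1000-step, linear-beta diffusion that predicts epsilon with fixed variance.
diffusion = create_gaussian_diffusion(
    steps=1000,
    learn_sigma=False,
    noise_schedule="linear",
    timestep_respacing="",  # empty string keeps all 1000 timesteps
)
print(type(diffusion).__name__)  # SpacedDiffusion
```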
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | import collections.abc 12 | import torch 13 | 14 | 15 | def batched_input_to_device(batched_inputs, device, exclude=()): 16 | """Recursively move tensors in ``batched_inputs`` to ``device``.""" 17 | if isinstance(exclude, str): 18 | exclude = [exclude] 19 | 20 | if isinstance(batched_inputs, torch.Tensor): 21 | batch = batched_inputs.to(device, non_blocking=True) 22 | return batch 23 | elif isinstance(batched_inputs, collections.abc.Mapping): 24 | # the mapping is modified in place; keys listed in `exclude` are left untouched 25 | for k in batched_inputs: 26 | if k not in exclude: 27 | batched_inputs[k] = batched_input_to_device(batched_inputs[k], device) 28 | return batched_inputs 29 | 30 | elif isinstance(batched_inputs, collections.abc.Sequence) and not isinstance( 31 | batched_inputs, str 32 | ): 33 | return [batched_input_to_device(d, device) for d in batched_inputs] 34 | elif isinstance(batched_inputs, str): 35 | return batched_inputs 36 | else: 37 | raise TypeError(f"Unsupported type {type(batched_inputs)}") 38 | -------------------------------------------------------------------------------- /odise/modeling/wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from .pano_wrapper import OpenPanopticInference 12 | 13 | __all__ = ["OpenPanopticInference"] 14 | -------------------------------------------------------------------------------- /odise/modeling/wrapper/pano_wrapper.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License.
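A quick sketch of what `batched_input_to_device` above does with a detectron2-style batch: nested containers are walked recursively, strings pass through unchanged, and keys listed in `exclude` are skipped at the level of the mapping the call receives (the recursive calls do not forward `exclude`). The batch contents are made up for illustration:

```python
import torch

from odise.modeling.preprocess import batched_input_to_device

batch = {
    "images": [torch.rand(3, 4, 4), torch.rand(3, 4, 4)],
    "captions": ["a cat on a couch", "a dog"],  # excluded: left untouched
    "meta": {"padding_mask": torch.zeros(4, 4)},
}

device = "cuda" if torch.cuda.is_available() else "cpu"
moved = batched_input_to_device(batch, device, exclude="captions")

print(moved["images"][0].device, moved["meta"]["padding_mask"].device)
```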
5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 7 | # 8 | # Written by Jiarui Xu 9 | # ------------------------------------------------------------------------------ 10 | 11 | from collections import OrderedDict 12 | import torch.nn as nn 13 | 14 | 15 | class OpenPanopticInference(nn.Module): 16 | def __init__( 17 | self, 18 | model, 19 | labels, 20 | metadata=None, 21 | semantic_on=True, 22 | instance_on=True, 23 | panoptic_on=True, 24 | test_topk_per_image=100, 25 | ): 26 | super().__init__() 27 | self.model = model 28 | self.labels = labels 29 | self.metadata = metadata 30 | 31 | self.semantic_on = semantic_on 32 | self.instance_on = instance_on 33 | self.panoptic_on = panoptic_on 34 | self.test_topk_per_image = test_topk_per_image 35 | 36 | self.open_state_dict = OrderedDict() 37 | 38 | for k in self.model.open_state_dict(): 39 | if k.endswith("test_labels"): 40 | self.open_state_dict[k] = self.labels 41 | elif k.endswith("metadata"): 42 | self.open_state_dict[k] = self.metadata 43 | elif k.endswith("num_classes"): 44 | self.open_state_dict[k] = self.num_classes 45 | elif k.endswith("semantic_on"): 46 | self.open_state_dict[k] = self.semantic_on 47 | elif k.endswith("instance_on"): 48 | self.open_state_dict[k] = self.instance_on 49 | elif k.endswith("panoptic_on"): 50 | self.open_state_dict[k] = self.panoptic_on 51 | elif k.endswith("test_topk_per_image"): 52 | self.open_state_dict[k] = self.test_topk_per_image 53 | 54 | @property 55 | def num_classes(self): 56 | return len(self.labels) 57 | 58 | def forward(self, batched_inputs): 59 | assert not self.training 60 | 61 | _open_state_dict = self.model.open_state_dict() 62 | self.model.load_open_state_dict(self.open_state_dict) 63 | 64 | results = self.model(batched_inputs) 65 | 66 | self.model.load_open_state_dict(_open_state_dict) 67 | 68 | return results 69 | -------------------------------------------------------------------------------- /odise/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/odise/utils/__init__.py -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length=100 3 | multi_line_output=3 4 | include_trailing_comma=True 5 | known_standard_library=numpy,setuptools,mock 6 | skip=./datasets,docs,local_data,third_party 7 | skip_glob=*/__init__.py,**/configs/**,tests/config/**,vision/modeling/mask2former/**,output/** 8 | known_myself=odise 9 | known_third_party=fvcore,matplotlib,cv2,torch,torchvision,PIL,pycocotools,yacs,termcolor,cityscapesscripts,tabulate,tqdm,scipy,lvis,psutil,pkg_resources,caffe2,onnx,panopticapi,black,isort,av,iopath,omegaconf,hydra,yaml,pydoc,submitit,cloudpickle 10 | no_lines_before=STDLIB,THIRDPARTY 11 | sections=FUTURE,STDLIB,THIRDPARTY,myself,FIRSTPARTY,LOCALFOLDER 12 | default_section=FIRSTPARTY 13 | 14 | [mypy] 15 | python_version=3.6 16 | ignore_missing_imports = True 17 | warn_unused_configs = True 18 | disallow_untyped_defs = True 19 | check_untyped_defs = True 20 | warn_unused_ignores = True 21 | warn_redundant_casts = True 22 | show_column_numbers = True 23 | follow_imports = silent 24 | allow_redefinition = True 25 | ; Require all functions to be annotated 26 | disallow_incomplete_defs = True 27 | 
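`OpenPanopticInference` in `pano_wrapper.py` above temporarily swaps an open-vocabulary label set (plus matching metadata and test-time flags) into a trained model and restores the original open state after the forward pass. A hedged sketch of wrapping a model for inference — it assumes `model` is a trained ODISE meta-architecture exposing `open_state_dict()` / `load_open_state_dict()`, and the labels/metadata values are placeholders:

```python
from detectron2.data import MetadataCatalog

from odise.modeling.wrapper import OpenPanopticInference

# Placeholder open-vocabulary label set: one list of synonyms per category.
labels = [["person"], ["dog", "puppy"], ["grass", "lawn"]]

inference_model = OpenPanopticInference(
    model=model,  # assumed: a trained ODISE model, e.g. built from a model-zoo config
    labels=labels,
    metadata=MetadataCatalog.get("odise_demo_metadata"),  # placeholder metadata entry
    semantic_on=False,
    instance_on=False,
    panoptic_on=True,
)
inference_model.eval()
# outputs = inference_model(batched_inputs)  # standard detectron2 list-of-dicts input
```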
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # ------------------------------------------------------------------------------ 4 | # Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 | # 6 | # This work is made available under the Nvidia Source Code License. 7 | # To view a copy of this license, visit 8 | # https://github.com/NVlabs/ODISE/blob/main/LICENSE 9 | # 10 | # Written by Jiarui Xu 11 | # ------------------------------------------------------------------------------ 12 | 13 | import glob 14 | import os 15 | import shutil 16 | from os import path 17 | from setuptools import find_packages, setup 18 | from typing import List 19 | import torch 20 | 21 | torch_ver = [int(x) for x in torch.__version__.split(".")[:2]] 22 | assert torch_ver >= [1, 8], "Requires PyTorch >= 1.8" 23 | 24 | 25 | def get_version(): 26 | init_py_path = path.join(path.abspath(path.dirname(__file__)), "odise", "__init__.py") 27 | init_py = open(init_py_path, "r").readlines() 28 | version_line = [l.strip() for l in init_py if l.startswith("__version__")][0] 29 | version = version_line.split("=")[-1].strip().strip("'\"") 30 | 31 | return version 32 | 33 | 34 | def get_model_zoo_configs() -> List[str]: 35 | """ 36 | Return a list of configs to include in package for model zoo. Copy over these configs inside 37 | odise/model_zoo. 38 | """ 39 | 40 | # Use absolute paths while symlinking. 41 | source_configs_dir = path.join(path.dirname(path.realpath(__file__)), "configs") 42 | destination = path.join(path.dirname(path.realpath(__file__)), "odise", "model_zoo", "configs") 43 | # Symlink the config directory inside package to have a cleaner pip install. 44 | 45 | # Remove stale symlink/directory from a previous build. 46 | if path.exists(source_configs_dir): 47 | if path.islink(destination): 48 | os.unlink(destination) 49 | elif path.isdir(destination): 50 | shutil.rmtree(destination) 51 | 52 | if not path.exists(destination): 53 | try: 54 | os.symlink(source_configs_dir, destination) 55 | except OSError: 56 | # Fall back to copying if symlink fails: ex. on Windows. 57 | shutil.copytree(source_configs_dir, destination) 58 | 59 | config_paths = glob.glob("configs/**/*.yaml", recursive=True) + glob.glob( 60 | "configs/**/*.py", recursive=True 61 | ) 62 | return config_paths 63 | 64 | 65 | setup( 66 | name="odise", 67 | version=get_version(), 68 | author="Jiarui Xu", 69 | url="https://github.com/NVlabs/ODISE", 70 | description="Open-vocabulary DIffusion-based Panoptic Segmentation", 71 | packages=find_packages(exclude=("configs", "tests*")), 72 | package_data={"odise.model_zoo": get_model_zoo_configs()}, 73 | python_requires=">=3.8", 74 | install_requires=[ 75 | "timm==0.6.11", # freeze timm version for stabliity 76 | "opencv-python==4.6.0.66", 77 | "diffdist==0.1", 78 | "nltk>=3.6.2", 79 | "einops>=0.3.0", 80 | "wandb>=0.12.11", 81 | # "transformers==4.20.1", # freeze transformers version for stabliity 82 | # there is BC breaking in omegaconf 2.2.1 83 | # see: https://github.com/omry/omegaconf/issues/939 84 | "omegaconf==2.1.1", 85 | "open-clip-torch==2.0.2", 86 | f"mask2former @ file://localhost/{os.getcwd()}/third_party/Mask2Former/", 87 | "stable-diffusion-sdkit==2.1.3", 88 | ], 89 | extras_require={ 90 | # dev dependencies. 
Install them by `pip install 'odise[dev]'` 91 | "dev": [ 92 | "flake8==3.8.1", 93 | "isort==4.3.21", 94 | "flake8-bugbear", 95 | "flake8-comprehensions", 96 | "click==8.0.4", 97 | "importlib-metadata==4.11.3", 98 | ], 99 | }, 100 | include_package_data=True, 101 | ) 102 | -------------------------------------------------------------------------------- /third_party/Mask2Former/.gitignore: -------------------------------------------------------------------------------- 1 | # output dir 2 | output 3 | instant_test_output 4 | inference_test_output 5 | 6 | 7 | *.png 8 | *.json 9 | *.diff 10 | *.jpg 11 | !/projects/DensePose/doc/images/*.jpg 12 | 13 | # compilation and distribution 14 | __pycache__ 15 | _ext 16 | *.pyc 17 | *.pyd 18 | *.so 19 | *.dll 20 | *.egg-info/ 21 | build/ 22 | dist/ 23 | wheels/ 24 | 25 | # pytorch/python/numpy formats 26 | *.pth 27 | *.pkl 28 | *.npy 29 | *.ts 30 | model_ts*.txt 31 | 32 | # ipython/jupyter notebooks 33 | *.ipynb 34 | **/.ipynb_checkpoints/ 35 | 36 | # Editor temporaries 37 | *.swn 38 | *.swo 39 | *.swp 40 | *~ 41 | 42 | # editor settings 43 | .idea 44 | .vscode 45 | _darcs 46 | 47 | # project dirs 48 | /detectron2/model_zoo/configs 49 | /datasets/* 50 | !/datasets/*.* 51 | /projects/*/datasets 52 | /models 53 | /snippet -------------------------------------------------------------------------------- /third_party/Mask2Former/ADVANCED_USAGE.md: -------------------------------------------------------------------------------- 1 | ## Advanced Usage of Mask2Former 2 | 3 | This document provides a brief intro of the advanced usage of Mask2Former for research purposes. 4 | 5 | Mask2Former is highly modularized; it consists of three components: a backbone, a pixel decoder and a Transformer decoder. 6 | You can easily replace each of these three components with your own implementation. 7 | 8 | ### Test Mask2Former with your own backbone 9 | 10 | 1. Define and register your backbone under `mask2former/modeling/backbone`. You can follow the Swin Transformer as an example. 11 | 2. Change the config file accordingly. 12 | 13 | ### Test Mask2Former with your own pixel decoder 14 | 15 | 1. Define and register your pixel decoder under `mask2former/modeling/pixel_decoder`. 16 | 2. Change the config file accordingly. 17 | 18 | Note that your pixel decoder must have a `self.forward_features(features)` method that returns three values (a minimal skeleton is sketched below): 19 | 1. `mask_features`, which are the per-pixel embeddings at 1/4 the resolution of the original image. This is used to produce binary masks. 20 | 2. `None`, you can simply return `None` for the second value. 21 | 3. `multi_scale_features`, which are the multi-scale inputs to the Transformer decoder. This must be a list with length 3. 22 | We use resolution 1/32, 1/16, and 1/8 but you can use arbitrary resolutions here. 23 | 24 | Example config to use a Transformer-encoder enhanced FPN instead of MSDeformAttn: 25 | ``` 26 | MODEL: 27 | SEM_SEG_HEAD: 28 | # pixel decoder 29 | PIXEL_DECODER_NAME: "TransformerEncoderPixelDecoder" 30 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 31 | COMMON_STRIDE: 4 32 | TRANSFORMER_ENC_LAYERS: 6 33 | ``` 34 | 35 | ### Build a new Transformer decoder. 36 | 37 | Transformer decoders are defined under `mask2former/modeling/transformer_decoder`.
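To make the `forward_features` contract above concrete, here is a bare-bones pixel decoder skeleton. It is a sketch, not a drop-in component: registration and config plumbing are omitted (see the existing decoders under `mask2former/modeling/pixel_decoder` for that), and the 1x1 convolutions stand in for real fusion logic:

```python
import torch
import torch.nn as nn


class ToyPixelDecoder(nn.Module):
    """Minimal pixel decoder honoring the Mask2Former interface described above."""

    def __init__(self, in_channels, conv_dim=256, mask_dim=256):
        super().__init__()
        # in_channels: e.g. {"res2": 256, "res3": 512, "res4": 1024, "res5": 2048}
        self.lateral = nn.ModuleDict(
            {name: nn.Conv2d(ch, conv_dim, kernel_size=1) for name, ch in in_channels.items()}
        )
        self.mask_proj = nn.Conv2d(conv_dim, mask_dim, kernel_size=1)

    def forward_features(self, features):
        # 1) per-pixel embeddings at 1/4 resolution ("res2"), used for binary masks
        mask_features = self.mask_proj(self.lateral["res2"](features["res2"]))
        # 2) the second return value is unused, so it is simply None
        # 3) exactly three multi-scale maps for the Transformer decoder (1/32, 1/16, 1/8 here)
        multi_scale_features = [
            self.lateral[name](features[name]) for name in ("res5", "res4", "res3")
        ]
        return mask_features, None, multi_scale_features


decoder = ToyPixelDecoder({"res2": 256, "res3": 512, "res4": 1024, "res5": 2048})
feats = {
    "res2": torch.rand(1, 256, 64, 64),
    "res3": torch.rand(1, 512, 32, 32),
    "res4": torch.rand(1, 1024, 16, 16),
    "res5": torch.rand(1, 2048, 8, 8),
}
mask_features, _, multi_scale_features = decoder.forward_features(feats)
print(mask_features.shape, [f.shape for f in multi_scale_features])
```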
38 | -------------------------------------------------------------------------------- /third_party/Mask2Former/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to. 4 | Please read the [full text](https://code.fb.com/codeofconduct/) 5 | so that you can understand what actions will and will not be tolerated. 6 | -------------------------------------------------------------------------------- /third_party/Mask2Former/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to maskformer2 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Our Development Process 6 | Minor changes and improvements will be released on an ongoing basis. Larger changes (e.g., changesets implementing a new paper) will be released on a more periodic basis. 7 | 8 | ## Pull Requests 9 | We actively welcome your pull requests. 10 | 11 | 1. Fork the repo and create your branch from `main`. 12 | 2. If you've added code that should be tested, add tests. 13 | 3. If you've changed APIs, update the documentation. 14 | 4. Ensure the test suite passes. 15 | 5. Make sure your code lints. 16 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 17 | 18 | ## Contributor License Agreement ("CLA") 19 | In order to accept your pull request, we need you to submit a CLA. You only need 20 | to do this once to work on any of Facebook's open source projects. 21 | 22 | Complete your CLA here: 23 | 24 | ## Issues 25 | We use GitHub issues to track public bugs. Please ensure your description is 26 | clear and has sufficient instructions to be able to reproduce the issue. 27 | 28 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 29 | disclosure of security bugs. In those cases, please go through the process 30 | outlined on that page and do not file a public issue. 31 | 32 | ## Coding Style 33 | * 4 spaces for indentation rather than tabs 34 | * 80 character line length 35 | * PEP8 formatting following [Black](https://black.readthedocs.io/en/stable/) 36 | 37 | ## License 38 | By contributing to MaskFormer, you agree that your contributions will be licensed 39 | under the LICENSE file in the root directory of this source tree. 40 | -------------------------------------------------------------------------------- /third_party/Mask2Former/GETTING_STARTED.md: -------------------------------------------------------------------------------- 1 | ## Getting Started with Mask2Former 2 | 3 | This document provides a brief intro of the usage of Mask2Former. 4 | 5 | Please see [Getting Started with Detectron2](https://github.com/facebookresearch/detectron2/blob/master/GETTING_STARTED.md) for full usage. 6 | 7 | 8 | ### Inference Demo with Pre-trained Models 9 | 10 | 1. Pick a model and its config file from 11 | [model zoo](MODEL_ZOO.md), 12 | for example, `configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml`. 13 | 2. We provide `demo.py` that is able to demo builtin configs. 
Run it with: 14 | ``` 15 | cd demo/ 16 | python demo.py --config-file ../configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml \ 17 | --input input1.jpg input2.jpg \ 18 | [--other-options] 19 | --opts MODEL.WEIGHTS /path/to/checkpoint_file 20 | ``` 21 | The configs are made for training, therefore we need to specify `MODEL.WEIGHTS` to a model from model zoo for evaluation. 22 | This command will run the inference and show visualizations in an OpenCV window. 23 | 24 | For details of the command line arguments, see `demo.py -h` or look at its source code 25 | to understand its behavior. Some common arguments are: 26 | * To run __on your webcam__, replace `--input files` with `--webcam`. 27 | * To run __on a video__, replace `--input files` with `--video-input video.mp4`. 28 | * To run __on cpu__, add `MODEL.DEVICE cpu` after `--opts`. 29 | * To save outputs to a directory (for images) or a file (for webcam or video), use `--output`. 30 | 31 | 32 | ### Training & Evaluation in Command Line 33 | 34 | We provide a script `train_net.py`, that is made to train all the configs provided in Mask2Former. 35 | 36 | To train a model with "train_net.py", first 37 | setup the corresponding datasets following 38 | [datasets/README.md](./datasets/README.md), 39 | then run: 40 | ``` 41 | python train_net.py --num-gpus 8 \ 42 | --config-file configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml 43 | ``` 44 | 45 | The configs are made for 8-GPU training. 46 | Since we use ADAMW optimizer, it is not clear how to scale learning rate with batch size. 47 | To train on 1 GPU, you need to figure out learning rate and batch size by yourself: 48 | ``` 49 | python train_net.py \ 50 | --config-file configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml \ 51 | --num-gpus 1 SOLVER.IMS_PER_BATCH SET_TO_SOME_REASONABLE_VALUE SOLVER.BASE_LR SET_TO_SOME_REASONABLE_VALUE 52 | ``` 53 | 54 | To evaluate a model's performance, use 55 | ``` 56 | python train_net.py \ 57 | --config-file configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml \ 58 | --eval-only MODEL.WEIGHTS /path/to/checkpoint_file 59 | ``` 60 | For more options, see `python train_net.py -h`. 61 | 62 | 63 | ### Video instance segmentation 64 | Please use `demo_video/demo.py` for video instance segmentation demo and `train_net_video.py` to train 65 | and evaluate video instance segmentation models. 66 | -------------------------------------------------------------------------------- /third_party/Mask2Former/INSTALL.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | ### Requirements 4 | - Linux or macOS with Python ≥ 3.6 5 | - PyTorch ≥ 1.9 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation. 6 | Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note, please check 7 | PyTorch version matches that is required by Detectron2. 8 | - Detectron2: follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html). 9 | - OpenCV is optional but needed by demo and visualization 10 | - `pip install -r requirements.txt` 11 | 12 | ### CUDA kernel for MSDeformAttn 13 | After preparing the required environment, run the following command to compile CUDA kernel for MSDeformAttn: 14 | 15 | `CUDA_HOME` must be defined and points to the directory of the installed CUDA toolkit. 
16 | 17 | ```bash 18 | cd mask2former/modeling/pixel_decoder/ops 19 | sh make.sh 20 | ``` 21 | 22 | #### Building on another system 23 | To build on a system that does not have a GPU device but provide the drivers: 24 | ```bash 25 | TORCH_CUDA_ARCH_LIST='8.0' FORCE_CUDA=1 python setup.py build install 26 | ``` 27 | 28 | ### Example conda environment setup 29 | ```bash 30 | conda create --name mask2former python=3.8 -y 31 | conda activate mask2former 32 | conda install pytorch==1.9.0 torchvision==0.10.0 cudatoolkit=11.1 -c pytorch -c nvidia 33 | pip install -U opencv-python 34 | 35 | # under your working directory 36 | git clone git@github.com:facebookresearch/detectron2.git 37 | cd detectron2 38 | pip install -e . 39 | pip install git+https://github.com/cocodataset/panopticapi.git 40 | pip install git+https://github.com/mcordts/cityscapesScripts.git 41 | 42 | cd .. 43 | git clone git@github.com:facebookresearch/Mask2Former.git 44 | cd Mask2Former 45 | pip install -r requirements.txt 46 | cd mask2former/modeling/pixel_decoder/ops 47 | sh make.sh 48 | ``` 49 | -------------------------------------------------------------------------------- /third_party/Mask2Former/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2022 Meta, Inc. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /third_party/Mask2Former/README.md: -------------------------------------------------------------------------------- 1 | # Mask2Former: Masked-attention Mask Transformer for Universal Image Segmentation (CVPR 2022) 2 | 3 | [Bowen Cheng](https://bowenc0221.github.io/), [Ishan Misra](https://imisra.github.io/), [Alexander G. Schwing](https://alexander-schwing.de/), [Alexander Kirillov](https://alexander-kirillov.github.io/), [Rohit Girdhar](https://rohitgirdhar.github.io/) 4 | 5 | [[`arXiv`](https://arxiv.org/abs/2112.01527)] [[`Project`](https://bowenc0221.github.io/mask2former)] [[`BibTeX`](#CitingMask2Former)] 6 | 7 |
8 | 9 |

10 | 11 | ### Features 12 | * A single architecture for panoptic, instance and semantic segmentation. 13 | * Support major segmentation datasets: ADE20K, Cityscapes, COCO, Mapillary Vistas. 14 | 15 | ## Updates 16 | * Add Google Colab demo. 17 | * Video instance segmentation is now supported! Please check our [tech report](https://arxiv.org/abs/2112.10764) for more details. 18 | 19 | ## Installation 20 | 21 | See [installation instructions](INSTALL.md). 22 | 23 | ## Getting Started 24 | 25 | See [Preparing Datasets for Mask2Former](datasets/README.md). 26 | 27 | See [Getting Started with Mask2Former](GETTING_STARTED.md). 28 | 29 | Run our demo using Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1uIWE5KbGFSjrxey2aRd5pWkKNY1_SaNq) 30 | 31 | Integrated into [Huggingface Spaces 🤗](https://huggingface.co/spaces) using [Gradio](https://github.com/gradio-app/gradio). Try out the Web Demo: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/akhaliq/Mask2Former) 32 | 33 | Replicate web demo and docker image is available here: [![Replicate](https://replicate.com/facebookresearch/mask2former/badge)](https://replicate.com/facebookresearch/mask2former) 34 | 35 | ## Advanced usage 36 | 37 | See [Advanced Usage of Mask2Former](ADVANCED_USAGE.md). 38 | 39 | ## Model Zoo and Baselines 40 | 41 | We provide a large set of baseline results and trained models available for download in the [Mask2Former Model Zoo](MODEL_ZOO.md). 42 | 43 | ## License 44 | 45 | Shield: [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 46 | 47 | The majority of Mask2Former is licensed under a [MIT License](LICENSE). 48 | 49 | 50 | However portions of the project are available under separate license terms: Swin-Transformer-Semantic-Segmentation is licensed under the [MIT license](https://github.com/SwinTransformer/Swin-Transformer-Semantic-Segmentation/blob/main/LICENSE), Deformable-DETR is licensed under the [Apache-2.0 License](https://github.com/fundamentalvision/Deformable-DETR/blob/main/LICENSE). 51 | 52 | ## Citing Mask2Former 53 | 54 | If you use Mask2Former in your research or wish to refer to the baseline results published in the [Model Zoo](MODEL_ZOO.md), please use the following BibTeX entry. 55 | 56 | ```BibTeX 57 | @inproceedings{cheng2021mask2former, 58 | title={Masked-attention Mask Transformer for Universal Image Segmentation}, 59 | author={Bowen Cheng and Ishan Misra and Alexander G. Schwing and Alexander Kirillov and Rohit Girdhar}, 60 | journal={CVPR}, 61 | year={2022} 62 | } 63 | ``` 64 | 65 | If you find the code useful, please also consider the following BibTeX entry. 66 | 67 | ```BibTeX 68 | @inproceedings{cheng2021maskformer, 69 | title={Per-Pixel Classification is Not All You Need for Semantic Segmentation}, 70 | author={Bowen Cheng and Alexander G. Schwing and Alexander Kirillov}, 71 | journal={NeurIPS}, 72 | year={2021} 73 | } 74 | ``` 75 | 76 | ## Acknowledgement 77 | 78 | Code is largely based on MaskFormer (https://github.com/facebookresearch/MaskFormer). 
79 | -------------------------------------------------------------------------------- /third_party/Mask2Former/cog.yaml: -------------------------------------------------------------------------------- 1 | build: 2 | gpu: true 3 | cuda: "10.1" 4 | python_version: "3.8" 5 | system_packages: 6 | - "libgl1-mesa-glx" 7 | - "libglib2.0-0" 8 | python_packages: 9 | - "ipython==7.30.1" 10 | - "numpy==1.21.4" 11 | - "torch==1.8.1" 12 | - "torchvision==0.9.1" 13 | - "opencv-python==4.5.5.62" 14 | - "Shapely==1.8.0" 15 | - "h5py==3.6.0" 16 | - "scipy==1.7.3" 17 | - "submitit==1.4.1" 18 | - "scikit-image==0.19.1" 19 | - "Cython==0.29.27" 20 | - "timm==0.4.12" 21 | run: 22 | - pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html 23 | - pip install git+https://github.com/cocodataset/panopticapi.git 24 | - pip install git+https://github.com/mcordts/cityscapesScripts.git 25 | - git clone https://github.com/facebookresearch/Mask2Former 26 | - TORCH_CUDA_ARCH_LIST='7.5' FORCE_CUDA=1 python Mask2Former/mask2former/modeling/pixel_decoder/ops/setup.py build install 27 | 28 | predict: "predict.py:Predictor" 29 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/instance-segmentation/Base-ADE20K-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_instance_train",) 18 | TEST: ("ade20k_instance_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 640 40 | MAX_SIZE_TRAIN: 2560 41 | MAX_SIZE_TEST: 2560 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (640, 640) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_instance" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 56 | MAX_SIZE: 4480 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/instance-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 100 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | 
MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/panoptic-segmentation/Base-ADE20K-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_panoptic_train",) 18 | TEST: ("ade20k_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 640 40 | MAX_SIZE_TRAIN: 2560 41 | MAX_SIZE_TEST: 2560 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (640, 640) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 
54 | ENABLED: False 55 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 56 | MAX_SIZE: 4480 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/panoptic-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/Base-ADE20K-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_sem_seg_train",) 18 | TEST: ("ade20k_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | 
BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 512 40 | MAX_SIZE_TRAIN: 2048 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 512) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 512 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [256, 384, 512, 640, 768, 896] 56 | MAX_SIZE: 3584 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 
13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_small_bs16_160k.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/Base-Cityscapes-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_instance_seg_train",) 18 | TEST: ("cityscapes_fine_instance_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_instance" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | 
STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 8 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: False 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: 
"D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/instance-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/Base-Cityscapes-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_panoptic_train",) 18 | TEST: ("cityscapes_fine_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | 
-------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | 
DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/Base-Cityscapes-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_sem_seg_train",) 18 | TEST: ("cityscapes_fine_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/maskformer2_R50_bs16_90k.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 
| -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/Base-COCO-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train",) 18 | TEST: ("coco_2017_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_instance_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/maskformer2_R101_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 80 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | 
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: False 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | SOLVER: 20 | STEPS: (655556, 710184) 21 | MAX_ITER: 737500 22 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | 
PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_tiny_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/Base-COCO-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train_panoptic",) 18 | TEST: ("coco_2017_val_panoptic_with_sem_seg",) # to evaluate instance and semantic performance as well 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_panoptic_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/maskformer2_R101_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 133 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | 
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: True 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: True 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | SOLVER: 20 | STEPS: (655556, 710184) 21 | MAX_ITER: 737500 22 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | 
EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_tiny_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/mapillary-vistas/panoptic-segmentation/Base-MapillaryVistas-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_vistas_panoptic_train",) 18 | TEST: ("mapillary_vistas_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 300000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 2048 40 | MAX_SIZE_TRAIN: 8192 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (1024, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 1024 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 0 53 | DATALOADER: 54 | FILTER_EMPTY_ANNOTATIONS: True 55 | NUM_WORKERS: 10 56 | VERSION: 2 57 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/mapillary-vistas/panoptic-segmentation/maskformer_R50_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-MapillaryVistas-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 65 7 | NUM_CLASSES: 65 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 
4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.0 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/mapillary-vistas/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/mapillary-vistas/semantic-segmentation/Base-MapillaryVistas-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_vistas_sem_seg_train",) 18 | TEST: ("mapillary_vistas_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 300000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 2048 40 | MAX_SIZE_TRAIN: 8192 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (1024, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 1024 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 0 53 | DATALOADER: 54 | FILTER_EMPTY_ANNOTATIONS: True 55 | NUM_WORKERS: 10 56 | VERSION: 2 57 | -------------------------------------------------------------------------------- 
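Several of the base configs above build `MIN_SIZE_TRAIN` with PyYAML's `!!python/object/apply:eval` tag, so the list of training scales is materialized when the YAML is parsed rather than written out by hand (detectron2's config loader falls back to an unsafe YAML load, which is what makes this tag legal). A small self-contained sketch of what the tag expands to, using the Mapillary Vistas base size of 2048 from the file just above:

```python
# Sketch: expanding the !!python/object/apply:eval construct used above.
# PyYAML's unsafe loader calls eval() on the quoted expression and stores the
# resulting list, i.e. 16 scales from 0.5x to 2.0x of the base size.
import yaml

snippet = (
    'MIN_SIZE_TRAIN: !!python/object/apply:eval '
    '["[int(x * 0.1 * 2048) for x in range(5, 21)]"]'
)
scales = yaml.unsafe_load(snippet)["MIN_SIZE_TRAIN"]

print(scales[0], scales[-1], len(scales))  # 1024 4096 16
assert scales == [int(x * 0.1 * 2048) for x in range(5, 21)]
```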
/third_party/Mask2Former/configs/mapillary-vistas/semantic-segmentation/maskformer2_R50_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-MapillaryVistas-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 65 7 | NUM_CLASSES: 65 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.0 45 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/mapillary-vistas/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/Base-YouTubeVIS-VideoInstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | STEM_TYPE: "basic" # not used 12 | STEM_OUT_CHANNELS: 64 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | # NORM: "SyncBN" 16 | RES5_MULTI_GRID: [1, 1, 1] # not used 17 | DATASETS: 18 | TRAIN: ("ytvis_2019_train",) 19 | TEST: ("ytvis_2019_val",) 20 | SOLVER: 21 | IMS_PER_BATCH: 16 22 | BASE_LR: 0.0001 23 | STEPS: (4000,) 24 | MAX_ITER: 6000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 10 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | BACKBONE_MULTIPLIER: 0.1 30 | CLIP_GRADIENTS: 31 | ENABLED: True 32 | CLIP_TYPE: "full_model" 33 | CLIP_VALUE: 0.01 34 | NORM_TYPE: 2.0 35 | AMP: 36 | ENABLED: True 37 | INPUT: 38 | MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" 39 | RANDOM_FLIP: "flip_by_clip" 40 | 
AUGMENTATIONS: [] 41 | MIN_SIZE_TRAIN: (360, 480) 42 | MIN_SIZE_TEST: 360 43 | CROP: 44 | ENABLED: False 45 | TYPE: "absolute_range" 46 | SIZE: (600, 720) 47 | FORMAT: "RGB" 48 | TEST: 49 | EVAL_PERIOD: 0 50 | DATALOADER: 51 | FILTER_EMPTY_ANNOTATIONS: False 52 | NUM_WORKERS: 4 53 | VERSION: 2 54 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/swin/video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_83d103.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TEST: 480 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_e5f453.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | INPUT: 20 | MIN_SIZE_TEST: 480 21 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/swin/video_maskformer2_swin_small_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_1e7f22.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/swin/video_maskformer2_swin_tiny_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_86143f.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | WEIGHTS: 
"model_final_eba159.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_3c8ec9.pkl" 4 | META_ARCHITECTURE: "VideoMaskFormer" 5 | SEM_SEG_HEAD: 6 | NAME: "MaskFormerHead" 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 40 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: False 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: False 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2021/Base-YouTubeVIS-VideoInstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | STEM_TYPE: "basic" # not used 12 | STEM_OUT_CHANNELS: 64 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | # NORM: "SyncBN" 16 | RES5_MULTI_GRID: [1, 1, 1] # not used 17 | DATASETS: 18 | TRAIN: ("ytvis_2021_train",) 19 | TEST: ("ytvis_2021_val",) 20 | SOLVER: 21 | IMS_PER_BATCH: 16 22 | BASE_LR: 0.0001 23 | STEPS: (5500,) 24 | MAX_ITER: 8000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 10 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | BACKBONE_MULTIPLIER: 0.1 30 | CLIP_GRADIENTS: 31 | ENABLED: True 32 | CLIP_TYPE: "full_model" 33 | CLIP_VALUE: 0.01 34 | NORM_TYPE: 2.0 35 | AMP: 36 | ENABLED: True 37 | INPUT: 38 | MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" 39 | RANDOM_FLIP: "flip_by_clip" 40 | AUGMENTATIONS: [] 41 | MIN_SIZE_TRAIN: (360, 480) 42 | MIN_SIZE_TEST: 360 43 | CROP: 44 | ENABLED: False 45 | TYPE: "absolute_range" 46 | SIZE: (600, 720) 47 | FORMAT: "RGB" 48 | TEST: 49 | EVAL_PERIOD: 0 50 | DATALOADER: 51 | FILTER_EMPTY_ANNOTATIONS: False 52 | NUM_WORKERS: 4 53 | VERSION: 2 54 | -------------------------------------------------------------------------------- 
/third_party/Mask2Former/configs/youtubevis_2021/swin/video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_83d103.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TEST: 480 19 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2021/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_e5f453.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | # OOM when using a larger test size 20 | # INPUT: 21 | # MIN_SIZE_TEST: 480 22 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2021/swin/video_maskformer2_swin_small_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_1e7f22.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2021/swin/video_maskformer2_swin_tiny_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_86143f.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /third_party/Mask2Former/configs/youtubevis_2021/video_maskformer2_R101_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_eba159.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- 
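Note: the `youtubevis_2021` swin-large config above keeps `MIN_SIZE_TEST` commented out because the larger test size runs out of GPU memory. A hedged sketch of how a memory-heavy step can be guarded with the `retry_if_cuda_oom` helper defined in `mask2former_video/utils/memory.py` later in this listing; the `upsample_masks` function and the tensor shape are made up for illustration and are not taken from the repository:

```
import torch
import torch.nn.functional as F

from mask2former_video.utils.memory import retry_if_cuda_oom


def upsample_masks(mask_logits, output_size):
    # Bilinear upsampling of per-query mask logits -- the kind of step that
    # can OOM at larger test resolutions.
    return F.interpolate(mask_logits, size=output_size, mode="bilinear", align_corners=False)


device = "cuda" if torch.cuda.is_available() else "cpu"
mask_logits = torch.randn(1, 100, 120, 216, device=device)  # hypothetical (batch, queries, H, W) shape
# Retries once after torch.cuda.empty_cache(); as a last resort it moves the inputs to CPU float32.
masks = retry_if_cuda_oom(upsample_masks)(mask_logits, (480, 864))
```

Because the CPU fallback converts inputs to float32 and may return CPU tensors, the caller is responsible for moving results back to the GPU if needed.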
/third_party/Mask2Former/configs/youtubevis_2021/video_maskformer2_R50_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_3c8ec9.pkl" 4 | META_ARCHITECTURE: "VideoMaskFormer" 5 | SEM_SEG_HEAD: 6 | NAME: "MaskFormerHead" 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 40 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: False 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: False 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /third_party/Mask2Former/datasets/ade20k_instance_catid_mapping.txt: -------------------------------------------------------------------------------- 1 | Instacne100 SceneParse150 FullADE20K 2 | 1 8 165 3 | 2 9 3055 4 | 3 11 350 5 | 4 13 1831 6 | 5 15 774 7 | 5 15 783 8 | 6 16 2684 9 | 7 19 687 10 | 8 20 471 11 | 9 21 401 12 | 10 23 1735 13 | 11 24 2473 14 | 12 25 2329 15 | 13 28 1564 16 | 14 31 57 17 | 15 32 2272 18 | 16 33 907 19 | 17 34 724 20 | 18 36 2985 21 | 18 36 533 22 | 19 37 1395 23 | 20 38 155 24 | 21 39 2053 25 | 22 40 689 26 | 23 42 266 27 | 24 43 581 28 | 25 44 2380 29 | 26 45 491 30 | 27 46 627 31 | 28 48 2388 32 | 29 50 943 33 | 30 51 2096 34 | 31 54 2530 35 | 32 56 420 36 | 33 57 1948 37 | 34 58 1869 38 | 35 59 2251 39 | 36 63 239 40 | 37 65 571 41 | 38 66 2793 42 | 39 67 978 43 | 40 68 236 44 | 41 70 181 45 | 42 71 629 46 | 43 72 2598 47 | 44 73 1744 48 | 45 74 1374 49 | 46 75 591 50 | 47 76 2679 51 | 48 77 223 52 | 49 79 47 53 | 50 81 327 54 | 51 82 2821 55 | 52 83 1451 56 | 53 84 2880 57 | 54 86 480 58 | 55 87 77 59 | 56 88 2616 60 | 57 89 246 61 | 57 89 247 62 | 58 90 2733 63 | 59 91 14 64 | 60 93 38 65 | 61 94 1936 66 | 62 96 120 67 | 63 98 1702 68 | 64 99 249 69 | 65 103 2928 70 | 66 104 2337 71 | 67 105 1023 72 | 68 108 2989 73 | 69 109 1930 74 | 70 111 2586 75 | 71 112 131 76 | 72 113 146 77 | 73 116 95 78 | 74 117 1563 79 | 75 119 1708 80 | 76 120 103 81 | 77 121 1002 82 | 78 122 2569 83 | 79 124 2833 84 | 80 125 1551 85 | 81 126 1981 86 | 82 127 29 87 | 83 128 187 88 | 84 130 747 89 | 85 131 2254 90 | 86 133 2262 91 | 87 134 1260 92 | 88 135 2243 93 | 89 136 2932 94 | 90 137 2836 95 | 91 138 2850 96 | 92 139 64 97 | 93 140 894 98 | 94 143 1919 99 | 95 144 1583 100 | 96 145 318 101 | 97 147 2046 102 | 98 148 1098 103 | 99 149 530 104 | 100 150 954 105 | -------------------------------------------------------------------------------- 
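Note: the `ade20k_instance_catid_mapping.txt` table above maps the 100 ADE20K instance categories to SceneParse150 and full-ADE20K ids; its header row contains the "Instacne100" typo, and a single instance id can map to several full-ADE20K ids (e.g. id 5 -> 774 and 783, id 18 -> 2985 and 533). A minimal parsing sketch, not part of the repository; the path and variable names are illustrative:

```
from collections import defaultdict
from pathlib import Path

mapping_path = Path("third_party/Mask2Former/datasets/ade20k_instance_catid_mapping.txt")

inst_to_scene = {}                 # Instance100 id -> SceneParse150 id
inst_to_full = defaultdict(list)   # Instance100 id -> list of FullADE20K ids

with open(mapping_path) as f:
    next(f)  # skip the "Instacne100 SceneParse150 FullADE20K" header row
    for line in f:
        if not line.strip():
            continue
        inst_id, scene_id, full_id = map(int, line.split())
        inst_to_scene[inst_id] = scene_id
        inst_to_full[inst_id].append(full_id)

print(len(inst_to_scene), "instance categories")  # expected: 100
```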
/third_party/Mask2Former/datasets/prepare_ade20k_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | import os 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import tqdm 9 | from PIL import Image 10 | 11 | 12 | def convert(input, output): 13 | img = np.asarray(Image.open(input)) 14 | assert img.dtype == np.uint8 15 | img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1 16 | Image.fromarray(img).save(output) 17 | 18 | 19 | if __name__ == "__main__": 20 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016" 21 | for name in ["training", "validation"]: 22 | annotation_dir = dataset_dir / "annotations" / name 23 | output_dir = dataset_dir / "annotations_detectron2" / name 24 | output_dir.mkdir(parents=True, exist_ok=True) 25 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 26 | output_file = output_dir / file.name 27 | convert(file, output_file) 28 | -------------------------------------------------------------------------------- /third_party/Mask2Former/datasets/prepare_coco_semantic_annos_from_panoptic_annos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | 5 | import functools 6 | import json 7 | import multiprocessing as mp 8 | import numpy as np 9 | import os 10 | import time 11 | from fvcore.common.download import download 12 | from panopticapi.utils import rgb2id 13 | from PIL import Image 14 | 15 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES 16 | 17 | 18 | def _process_panoptic_to_semantic(input_panoptic, output_semantic, segments, id_map): 19 | panoptic = np.asarray(Image.open(input_panoptic), dtype=np.uint32) 20 | panoptic = rgb2id(panoptic) 21 | output = np.zeros_like(panoptic, dtype=np.uint8) + 255 22 | for seg in segments: 23 | cat_id = seg["category_id"] 24 | new_cat_id = id_map[cat_id] 25 | output[panoptic == seg["id"]] = new_cat_id 26 | Image.fromarray(output).save(output_semantic) 27 | 28 | 29 | def separate_coco_semantic_from_panoptic(panoptic_json, panoptic_root, sem_seg_root, categories): 30 | """ 31 | Create semantic segmentation annotations from panoptic segmentation 32 | annotations, to be used by PanopticFPN. 33 | It maps all thing categories to class 0, and maps all unlabeled pixels to class 255. 34 | It maps all stuff categories to contiguous ids starting from 1. 35 | Args: 36 | panoptic_json (str): path to the panoptic json file, in COCO's format. 37 | panoptic_root (str): a directory with panoptic annotation files, in COCO's format. 38 | sem_seg_root (str): a directory to output semantic annotation files 39 | categories (list[dict]): category metadata. Each dict needs to have: 40 | "id": corresponds to the "category_id" in the json annotations 41 | "isthing": 0 or 1 42 | """ 43 | os.makedirs(sem_seg_root, exist_ok=True) 44 | 45 | id_map = {} # map from category id to id in the output semantic annotation 46 | assert len(categories) <= 254 47 | for i, k in enumerate(categories): 48 | id_map[k["id"]] = i 49 | # what is id = 0? 
50 | # id_map[0] = 255 51 | print(id_map) 52 | 53 | with open(panoptic_json) as f: 54 | obj = json.load(f) 55 | 56 | pool = mp.Pool(processes=max(mp.cpu_count() // 2, 4)) 57 | 58 | def iter_annotations(): 59 | for anno in obj["annotations"]: 60 | file_name = anno["file_name"] 61 | segments = anno["segments_info"] 62 | input = os.path.join(panoptic_root, file_name) 63 | output = os.path.join(sem_seg_root, file_name) 64 | yield input, output, segments 65 | 66 | print("Start writing to {} ...".format(sem_seg_root)) 67 | start = time.time() 68 | pool.starmap( 69 | functools.partial(_process_panoptic_to_semantic, id_map=id_map), 70 | iter_annotations(), 71 | chunksize=100, 72 | ) 73 | print("Finished. time: {:.2f}s".format(time.time() - start)) 74 | 75 | 76 | if __name__ == "__main__": 77 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco") 78 | for s in ["val2017", "train2017"]: 79 | separate_coco_semantic_from_panoptic( 80 | os.path.join(dataset_dir, "annotations/panoptic_{}.json".format(s)), 81 | os.path.join(dataset_dir, "panoptic_{}".format(s)), 82 | os.path.join(dataset_dir, "panoptic_semseg_{}".format(s)), 83 | COCO_CATEGORIES, 84 | ) 85 | -------------------------------------------------------------------------------- /third_party/Mask2Former/demo/README.md: -------------------------------------------------------------------------------- 1 | ## Mask2Former Demo 2 | 3 | We provide a command line tool to run a simple demo of builtin configs. 4 | The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md). 5 | -------------------------------------------------------------------------------- /third_party/Mask2Former/demo_video/README.md: -------------------------------------------------------------------------------- 1 | ## Video Mask2Former Demo 2 | 3 | We provide a command line tool to run a simple demo of builtin configs. 4 | The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md). 5 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import data # register all new datasets 3 | from . import modeling 4 | 5 | # config 6 | from .config import add_maskformer2_config 7 | 8 | # dataset loading 9 | from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper 10 | from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper 11 | from .data.dataset_mappers.mask_former_instance_dataset_mapper import ( 12 | MaskFormerInstanceDatasetMapper, 13 | ) 14 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import ( 15 | MaskFormerPanopticDatasetMapper, 16 | ) 17 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 18 | MaskFormerSemanticDatasetMapper, 19 | ) 20 | 21 | # models 22 | from .maskformer_model import MaskFormer 23 | from .test_time_augmentation import SemanticSegmentorWithTTA 24 | 25 | # evaluation 26 | from .evaluation.instance_evaluation import InstanceSegEvaluator 27 | 28 | __version__ = "0.1" -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . 
import datasets 3 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import ( 3 | register_ade20k_full, 4 | register_ade20k_panoptic, 5 | register_coco_stuff_10k, 6 | register_mapillary_vistas, 7 | register_coco_panoptic_annos_semseg, 8 | register_ade20k_instance, 9 | register_mapillary_vistas_panoptic, 10 | ) 11 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/third_party/Mask2Former/mask2former/evaluation/__init__.py -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .backbone.swin import D2SwinTransformer 3 | from .pixel_decoder.fpn import BasePixelDecoder 4 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder 5 | from .meta_arch.mask_former_head import MaskFormerHead 6 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead 7 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/ODISE/2b187e4b2ff4c3d5da342aec2cc234b537720a65/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/__init__.py -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | try: 22 | import MultiScaleDeformableAttention as MSDA 23 | except ModuleNotFoundError as e: 24 | info_string = ( 25 | "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" 26 | "\t`cd mask2former/modeling/pixel_decoder/ops`\n" 27 | "\t`sh make.sh`\n" 28 | ) 29 | MSDA = None 30 | 31 | 32 | class MSDeformAttnFunction(Function): 33 | @staticmethod 34 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 35 | ctx.im2col_step = im2col_step 36 | output = MSDA.ms_deform_attn_forward( 37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 38 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 39 | return output 40 | 41 | @staticmethod 42 | @once_differentiable 43 | def backward(ctx, grad_output): 44 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 45 | grad_value, grad_sampling_loc, grad_attn_weight = \ 46 | MSDA.ms_deform_attn_backward( 47 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 48 | 49 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 50 | 51 | 52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 53 | # for debug and test only, 54 | # need to use cuda version instead 55 | N_, S_, M_, D_ = value.shape 56 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 57 | value_list = value.split([H_ * 
W_ for H_, W_ in value_spatial_shapes], dim=1) 58 | sampling_grids = 2 * sampling_locations - 1 59 | sampling_value_list = [] 60 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 61 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 62 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 63 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 64 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 65 | # N_*M_, D_, Lq_, P_ 66 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 67 | mode='bilinear', padding_mode='zeros', align_corners=False) 68 | sampling_value_list.append(sampling_value_l_) 69 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 70 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 71 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 72 | return output.transpose(1, 2).contiguous() 73 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | 22 | at::Tensor 23 | ms_deform_attn_cpu_forward( 24 | const at::Tensor &value, 25 | const at::Tensor &spatial_shapes, 26 | const at::Tensor &level_start_index, 27 | const at::Tensor &sampling_loc, 28 | const at::Tensor &attn_weight, 29 | const int im2col_step) 30 | { 31 | AT_ERROR("Not implement on cpu"); 32 | } 33 | 34 | std::vector 35 | ms_deform_attn_cpu_backward( 36 | const at::Tensor &value, 37 | const at::Tensor &spatial_shapes, 38 | const at::Tensor &level_start_index, 39 | const at::Tensor &sampling_loc, 40 | const at::Tensor &attn_weight, 41 | const at::Tensor &grad_output, 42 | const int im2col_step) 43 | { 44 | AT_ERROR("Not implement on cpu"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor 20 | ms_deform_attn_cpu_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step); 27 | 28 | std::vector 29 | ms_deform_attn_cpu_backward( 30 | const at::Tensor &value, 31 | const at::Tensor &spatial_shapes, 32 | const at::Tensor &level_start_index, 33 | const at::Tensor &sampling_loc, 34 | const at::Tensor &attn_weight, 35 | const at::Tensor &grad_output, 36 | const int im2col_step); 37 | 38 | 39 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor ms_deform_attn_cuda_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step); 26 | 27 | std::vector ms_deform_attn_cuda_backward( 28 | const at::Tensor &value, 29 | const at::Tensor &spatial_shapes, 30 | const at::Tensor &level_start_index, 31 | const at::Tensor &sampling_loc, 32 | const at::Tensor &attn_weight, 33 | const at::Tensor &grad_output, 34 | const int im2col_step); 35 | 36 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | 18 | #include "cpu/ms_deform_attn_cpu.h" 19 | 20 | #ifdef WITH_CUDA 21 | #include "cuda/ms_deform_attn_cuda.h" 22 | #endif 23 | 24 | 25 | at::Tensor 26 | ms_deform_attn_forward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const int im2col_step) 33 | { 34 | if (value.type().is_cuda()) 35 | { 36 | #ifdef WITH_CUDA 37 | return ms_deform_attn_cuda_forward( 38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector 47 | ms_deform_attn_backward( 48 | const at::Tensor &value, 49 | const at::Tensor &spatial_shapes, 50 | const at::Tensor &level_start_index, 51 | const at::Tensor &sampling_loc, 52 | const at::Tensor &attn_weight, 53 | const at::Tensor &grad_output, 54 | const int im2col_step) 55 | { 56 | if (value.type().is_cuda()) 57 | { 58 | #ifdef WITH_CUDA 59 | return ms_deform_attn_cuda_backward( 60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 61 | #else 62 | AT_ERROR("Not compiled with GPU support"); 63 | #endif 64 | } 65 | AT_ERROR("Not implemented on the CPU"); 66 | } 67 | 68 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include "ms_deform_attn.h" 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .maskformer_transformer_decoder import StandardTransformerDecoder 3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder 4 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 
16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | if mask is None: 31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 32 | not_mask = ~mask 33 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 34 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 35 | if self.normalize: 36 | eps = 1e-6 37 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 38 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 39 | 40 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 41 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 42 | 43 | pos_x = x_embed[:, :, :, None] / dim_t 44 | pos_y = y_embed[:, :, :, None] / dim_t 45 | pos_x = torch.stack( 46 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 47 | ).flatten(3) 48 | pos_y = torch.stack( 49 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 50 | ).flatten(3) 51 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 52 | return pos 53 | 54 | def __repr__(self, _repr_indent=4): 55 | head = "Positional encoding " + self.__class__.__name__ 56 | body = [ 57 | "num_pos_feats: {}".format(self.num_pos_feats), 58 | "temperature: {}".format(self.temperature), 59 | "normalize: {}".format(self.normalize), 60 | "scale: {}".format(self.scale), 61 | ] 62 | # _repr_indent = 4 63 | lines = [head] + [" " * _repr_indent + line for line in body] 64 | return "\n".join(lines) 65 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import modeling 3 | 4 | # config 5 | from .config import add_maskformer2_video_config 6 | 7 | # models 8 | from .video_maskformer_model import VideoMaskFormer 9 | 10 | # video 11 | from .data_video import ( 12 | YTVISDatasetMapper, 13 | YTVISEvaluator, 14 | build_detection_train_loader, 15 | build_detection_test_loader, 16 | get_detection_dataset_dicts, 17 | ) 18 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | from detectron2.config import CfgNode as CN 4 | 5 | 6 | def add_maskformer2_video_config(cfg): 7 | # video data 8 | # DataLoader 9 | cfg.INPUT.SAMPLING_FRAME_NUM = 2 10 | cfg.INPUT.SAMPLING_FRAME_RANGE = 20 11 | cfg.INPUT.SAMPLING_FRAME_SHUFFLE = False 12 | cfg.INPUT.AUGMENTATIONS = [] # "brightness", "contrast", "saturation", "rotation" 13 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/data_video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from .dataset_mapper import YTVISDatasetMapper, CocoClipDatasetMapper 5 | from .build import * 6 | 7 | from .datasets import * 8 | from .ytvis_eval import YTVISEvaluator 9 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/data_video/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from . import builtin # ensure the builtin datasets are registered 5 | 6 | __all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")] 7 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/data_video/datasets/builtin.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | import os 5 | 6 | from .ytvis import ( 7 | register_ytvis_instances, 8 | _get_ytvis_2019_instances_meta, 9 | _get_ytvis_2021_instances_meta, 10 | ) 11 | 12 | # ==== Predefined splits for YTVIS 2019 =========== 13 | _PREDEFINED_SPLITS_YTVIS_2019 = { 14 | "ytvis_2019_train": ("ytvis_2019/train/JPEGImages", 15 | "ytvis_2019/train.json"), 16 | "ytvis_2019_val": ("ytvis_2019/valid/JPEGImages", 17 | "ytvis_2019/valid.json"), 18 | "ytvis_2019_test": ("ytvis_2019/test/JPEGImages", 19 | "ytvis_2019/test.json"), 20 | } 21 | 22 | 23 | # ==== Predefined splits for YTVIS 2021 =========== 24 | _PREDEFINED_SPLITS_YTVIS_2021 = { 25 | "ytvis_2021_train": ("ytvis_2021/train/JPEGImages", 26 | "ytvis_2021/train.json"), 27 | "ytvis_2021_val": ("ytvis_2021/valid/JPEGImages", 28 | "ytvis_2021/valid.json"), 29 | "ytvis_2021_test": ("ytvis_2021/test/JPEGImages", 30 | "ytvis_2021/test.json"), 31 | } 32 | 33 | 34 | def register_all_ytvis_2019(root): 35 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2019.items(): 36 | # Assume pre-defined datasets live in `./datasets`. 37 | register_ytvis_instances( 38 | key, 39 | _get_ytvis_2019_instances_meta(), 40 | os.path.join(root, json_file) if "://" not in json_file else json_file, 41 | os.path.join(root, image_root), 42 | ) 43 | 44 | 45 | def register_all_ytvis_2021(root): 46 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2021.items(): 47 | # Assume pre-defined datasets live in `./datasets`. 
48 | register_ytvis_instances( 49 | key, 50 | _get_ytvis_2021_instances_meta(), 51 | os.path.join(root, json_file) if "://" not in json_file else json_file, 52 | os.path.join(root, image_root), 53 | ) 54 | 55 | 56 | if __name__.endswith(".builtin"): 57 | # Assume pre-defined datasets live in `./datasets`. 58 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 59 | register_all_ytvis_2019(_root) 60 | register_all_ytvis_2021(_root) 61 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/data_video/datasets/ytvis_api/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi 3 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .transformer_decoder.video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder 3 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder 3 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine3D(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 
16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | # b, t, c, h, w 31 | assert x.dim() == 5, f"{x.shape} should be a 5-dimensional Tensor, got {x.dim()}-dimensional Tensor instead" 32 | if mask is None: 33 | mask = torch.zeros((x.size(0), x.size(1), x.size(3), x.size(4)), device=x.device, dtype=torch.bool) 34 | not_mask = ~mask 35 | z_embed = not_mask.cumsum(1, dtype=torch.float32) 36 | y_embed = not_mask.cumsum(2, dtype=torch.float32) 37 | x_embed = not_mask.cumsum(3, dtype=torch.float32) 38 | if self.normalize: 39 | eps = 1e-6 40 | z_embed = z_embed / (z_embed[:, -1:, :, :] + eps) * self.scale 41 | y_embed = y_embed / (y_embed[:, :, -1:, :] + eps) * self.scale 42 | x_embed = x_embed / (x_embed[:, :, :, -1:] + eps) * self.scale 43 | 44 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 45 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 46 | 47 | dim_t_z = torch.arange((self.num_pos_feats * 2), dtype=torch.float32, device=x.device) 48 | dim_t_z = self.temperature ** (2 * (dim_t_z // 2) / (self.num_pos_feats * 2)) 49 | 50 | pos_x = x_embed[:, :, :, :, None] / dim_t 51 | pos_y = y_embed[:, :, :, :, None] / dim_t 52 | pos_z = z_embed[:, :, :, :, None] / dim_t_z 53 | pos_x = torch.stack((pos_x[:, :, :, :, 0::2].sin(), pos_x[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 54 | pos_y = torch.stack((pos_y[:, :, :, :, 0::2].sin(), pos_y[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 55 | pos_z = torch.stack((pos_z[:, :, :, :, 0::2].sin(), pos_z[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 56 | pos = (torch.cat((pos_y, pos_x), dim=4) + pos_z).permute(0, 1, 4, 2, 3) # b, t, c, h, w 57 | return pos 58 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /third_party/Mask2Former/mask2former_video/utils/memory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | import logging 4 | from contextlib import contextmanager 5 | from functools import wraps 6 | import torch 7 | from torch.cuda.amp import autocast 8 | 9 | __all__ = ["retry_if_cuda_oom"] 10 | 11 | 12 | @contextmanager 13 | def _ignore_torch_cuda_oom(): 14 | """ 15 | A context which ignores CUDA OOM exception from pytorch. 16 | """ 17 | try: 18 | yield 19 | except RuntimeError as e: 20 | # NOTE: the string may change? 21 | if "CUDA out of memory. " in str(e): 22 | pass 23 | else: 24 | raise 25 | 26 | 27 | def retry_if_cuda_oom(func): 28 | """ 29 | Makes a function retry itself after encountering 30 | pytorch's CUDA OOM error. 31 | It will first retry after calling `torch.cuda.empty_cache()`. 32 | If that still fails, it will then retry by trying to convert inputs to CPUs. 33 | In this case, it expects the function to dispatch to CPU implementation. 
34 | The return values may become CPU tensors as well and it's user's 35 | responsibility to convert it back to CUDA tensor if needed. 36 | Args: 37 | func: a stateless callable that takes tensor-like objects as arguments 38 | Returns: 39 | a callable which retries `func` if OOM is encountered. 40 | Examples: 41 | :: 42 | output = retry_if_cuda_oom(some_torch_function)(input1, input2) 43 | # output may be on CPU even if inputs are on GPU 44 | Note: 45 | 1. When converting inputs to CPU, it will only look at each argument and check 46 | if it has `.device` and `.to` for conversion. Nested structures of tensors 47 | are not supported. 48 | 2. Since the function might be called more than once, it has to be 49 | stateless. 50 | """ 51 | 52 | def maybe_to_cpu(x): 53 | try: 54 | like_gpu_tensor = x.device.type == "cuda" and hasattr(x, "to") 55 | except AttributeError: 56 | like_gpu_tensor = False 57 | if like_gpu_tensor: 58 | return x.to(device="cpu").to(torch.float32) 59 | else: 60 | return x 61 | 62 | @wraps(func) 63 | def wrapped(*args, **kwargs): 64 | with _ignore_torch_cuda_oom(): 65 | return func(*args, **kwargs) 66 | 67 | # Clear cache and retry 68 | torch.cuda.empty_cache() 69 | with _ignore_torch_cuda_oom(): 70 | return func(*args, **kwargs) 71 | 72 | # Try on CPU. This slows down the code significantly, therefore print a notice. 73 | logger = logging.getLogger(__name__) 74 | logger.info("Attempting to copy inputs to CPU due to CUDA OOM") 75 | new_args = (maybe_to_cpu(x) for x in args) 76 | new_kwargs = {k: maybe_to_cpu(v) for k, v in kwargs.items()} 77 | with autocast(enabled=False): 78 | return func(*new_args, **new_kwargs) 79 | 80 | return wrapped 81 | -------------------------------------------------------------------------------- /third_party/Mask2Former/predict.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, "Mask2Former") 3 | import tempfile 4 | from pathlib import Path 5 | import numpy as np 6 | import cv2 7 | import cog 8 | 9 | # import some common detectron2 utilities 10 | from detectron2.config import CfgNode as CN 11 | from detectron2.engine import DefaultPredictor 12 | from detectron2.config import get_cfg 13 | from detectron2.utils.visualizer import Visualizer, ColorMode 14 | from detectron2.data import MetadataCatalog 15 | from detectron2.projects.deeplab import add_deeplab_config 16 | 17 | # import Mask2Former project 18 | from mask2former import add_maskformer2_config 19 | 20 | 21 | class Predictor(cog.Predictor): 22 | def setup(self): 23 | cfg = get_cfg() 24 | add_deeplab_config(cfg) 25 | add_maskformer2_config(cfg) 26 | cfg.merge_from_file("Mask2Former/configs/coco/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml") 27 | cfg.MODEL.WEIGHTS = 'model_final_f07440.pkl' 28 | cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True 29 | cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = True 30 | cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = True 31 | self.predictor = DefaultPredictor(cfg) 32 | self.coco_metadata = MetadataCatalog.get("coco_2017_val_panoptic") 33 | 34 | 35 | @cog.input( 36 | "image", 37 | type=Path, 38 | help="Input image for segmentation. 
Output will be the concatenation of Panoptic segmentation (top), " 39 | "instance segmentation (middle), and semantic segmentation (bottom).", 40 | ) 41 | def predict(self, image): 42 | im = cv2.imread(str(image)) 43 | outputs = self.predictor(im) 44 | v = Visualizer(im[:, :, ::-1], self.coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW) 45 | panoptic_result = v.draw_panoptic_seg(outputs["panoptic_seg"][0].to("cpu"), 46 | outputs["panoptic_seg"][1]).get_image() 47 | v = Visualizer(im[:, :, ::-1], self.coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW) 48 | instance_result = v.draw_instance_predictions(outputs["instances"].to("cpu")).get_image() 49 | v = Visualizer(im[:, :, ::-1], self.coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW) 50 | semantic_result = v.draw_sem_seg(outputs["sem_seg"].argmax(0).to("cpu")).get_image() 51 | result = np.concatenate((panoptic_result, instance_result, semantic_result), axis=0)[:, :, ::-1] 52 | out_path = Path(tempfile.mkdtemp()) / "out.png" 53 | cv2.imwrite(str(out_path), result) 54 | return out_path 55 | -------------------------------------------------------------------------------- /third_party/Mask2Former/requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | scipy 3 | shapely 4 | timm 5 | h5py 6 | submitit 7 | scikit-image 8 | -------------------------------------------------------------------------------- /third_party/Mask2Former/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | import glob 5 | import os 6 | from os import path 7 | from setuptools import find_packages, setup 8 | import torch 9 | from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension 10 | 11 | torch_ver = [int(x) for x in torch.__version__.split(".")[:2]] 12 | assert torch_ver >= [1, 8], "Requires PyTorch >= 1.8" 13 | 14 | 15 | def get_version(): 16 | init_py_path = path.join(path.abspath(path.dirname(__file__)), "mask2former", "__init__.py") 17 | init_py = open(init_py_path, "r").readlines() 18 | version_line = [l.strip() for l in init_py if l.startswith("__version__")][0] 19 | version = version_line.split("=")[-1].strip().strip("'\"") 20 | 21 | return version 22 | 23 | 24 | # Copied from Detectron2 25 | def get_extensions(): 26 | # skip building 27 | if not (os.environ.get("FORCE_CUDA") or torch.cuda.is_available()) or CUDA_HOME is None: 28 | return [] 29 | 30 | this_dir = os.path.dirname(os.path.abspath(__file__)) 31 | extensions_dir = os.path.join(this_dir, "mask2former/modeling/pixel_decoder/ops/src") 32 | 33 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 34 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 35 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 36 | 37 | sources = main_file + source_cpu 38 | extension = CppExtension 39 | extra_compile_args = {"cxx": []} 40 | define_macros = [] 41 | 42 | # Force cuda since torch ask for a device, not if cuda is in fact available. 
43 | if (os.environ.get("FORCE_CUDA") or torch.cuda.is_available()) and CUDA_HOME is not None: 44 | extension = CUDAExtension 45 | sources += source_cuda 46 | define_macros += [("WITH_CUDA", None)] 47 | extra_compile_args["nvcc"] = [ 48 | "-DCUDA_HAS_FP16=1", 49 | "-D__CUDA_NO_HALF_OPERATORS__", 50 | "-D__CUDA_NO_HALF_CONVERSIONS__", 51 | "-D__CUDA_NO_HALF2_OPERATORS__", 52 | ] 53 | else: 54 | if CUDA_HOME is None: 55 | raise NotImplementedError( 56 | "CUDA_HOME is None. Please set environment variable CUDA_HOME." 57 | ) 58 | else: 59 | raise NotImplementedError( 60 | "No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available()." # noqa 61 | ) 62 | 63 | sources = [os.path.join(extensions_dir, s) for s in sources] 64 | include_dirs = [extensions_dir] 65 | ext_modules = [ 66 | extension( 67 | "MultiScaleDeformableAttention", 68 | sources, 69 | include_dirs=include_dirs, 70 | define_macros=define_macros, 71 | extra_compile_args=extra_compile_args, 72 | ) 73 | ] 74 | return ext_modules 75 | 76 | 77 | setup( 78 | name="mask2former", 79 | version=get_version(), 80 | author="Bowen Cheng", # Thanks Bowen! 81 | url="https://github.com/facebook/mask2former", 82 | description="A pip installable version of mask2former", 83 | packages=find_packages(exclude=("configs", "tests*")), 84 | python_requires=">=3.6", 85 | install_requires=[ 86 | "detectron2 @ https://github.com/facebookresearch/detectron2/archive/v0.6.zip", 87 | "scipy>=1.7.3", 88 | "boto3>=1.21.25", 89 | "hydra-core==1.1.1", 90 | # there is BC breaking in omegaconf 2.2.1 91 | # see: https://github.com/omry/omegaconf/issues/939 92 | "omegaconf==2.1.1", 93 | "panopticapi @ https://github.com/cocodataset/panopticapi/archive/master.zip", 94 | "lvis @ https://github.com/lvis-dataset/lvis-api/archive/master.zip", 95 | ], 96 | ext_modules=get_extensions(), 97 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 98 | ) 99 | -------------------------------------------------------------------------------- /third_party/Mask2Former/tools/README.md: -------------------------------------------------------------------------------- 1 | This directory contains few tools for MaskFormer. 2 | 3 | * `convert-torchvision-to-d2.py` 4 | 5 | Tool to convert torchvision pre-trained weights for D2. 6 | 7 | ``` 8 | wget https://download.pytorch.org/models/resnet101-63fe2227.pth 9 | python tools/convert-torchvision-to-d2.py resnet101-63fe2227.pth R-101.pkl 10 | ``` 11 | 12 | * `convert-pretrained-swin-model-to-d2.py` 13 | 14 | Tool to convert Swin Transformer pre-trained weights for D2. 
--------------------------------------------------------------------------------
/third_party/Mask2Former/tools/README.md:
--------------------------------------------------------------------------------
This directory contains a few tools for Mask2Former.

* `convert-torchvision-to-d2.py`

Tool to convert torchvision pre-trained weights for D2.

```
wget https://download.pytorch.org/models/resnet101-63fe2227.pth
python tools/convert-torchvision-to-d2.py resnet101-63fe2227.pth R-101.pkl
```

* `convert-pretrained-swin-model-to-d2.py`

Tool to convert Swin Transformer pre-trained weights for D2.

```
pip install timm

wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth
python tools/convert-pretrained-swin-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl

wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth
python tools/convert-pretrained-swin-model-to-d2.py swin_small_patch4_window7_224.pth swin_small_patch4_window7_224.pkl

wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth
python tools/convert-pretrained-swin-model-to-d2.py swin_base_patch4_window12_384_22k.pth swin_base_patch4_window12_384_22k.pkl

wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth
python tools/convert-pretrained-swin-model-to-d2.py swin_large_patch4_window12_384_22k.pth swin_large_patch4_window12_384_22k.pkl
```

A Python sketch that batch-downloads and converts all four Swin checkpoints is included after this tool list.

* `evaluate_pq_for_semantic_segmentation.py`

Tool to evaluate PQ (PQ-stuff) for semantic segmentation predictions.

Usage:

```
python tools/evaluate_pq_for_semantic_segmentation.py --dataset-name ade20k_sem_seg_val --json-file OUTPUT_DIR/inference/sem_seg_predictions.json
```

where `OUTPUT_DIR` is set in the config file.

* `evaluate_coco_boundary_ap.py`

Tool to evaluate Boundary AP for instance segmentation predictions.

Usage:

```
python tools/evaluate_coco_boundary_ap.py --gt-json-file COCO_GT_JSON --dt-json-file COCO_DT_JSON
```

To install the Boundary IoU API, run:

```
pip install git+https://github.com/bowenc0221/boundary-iou-api.git
```

* `analyze_model.py`

Tool to analyze model parameters and FLOPs.

Usage for semantic segmentation (ADE20K only, use with caution!):

```
python tools/analyze_model.py --num-inputs 1 --tasks flop --use-fixed-input-size --config-file CONFIG_FILE
```

Note that for semantic segmentation (ADE20K only), we use a dummy image with a fixed size equal to `cfg.INPUT.CROP.SIZE[0] x cfg.INPUT.CROP.SIZE[0]`.
Please do not use `--use-fixed-input-size` for calculating FLOPs on other datasets like Cityscapes!

Usage for panoptic and instance segmentation:

```
python tools/analyze_model.py --num-inputs 100 --tasks flop --config-file CONFIG_FILE
```

Note that for panoptic and instance segmentation, we compute the average FLOPs over 100 real validation images.
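As referenced under `convert-pretrained-swin-model-to-d2.py` above, here is a small sketch (not part of the repository) that batch-downloads and converts all four Swin checkpoints listed in the README. It assumes network access and that it is run from the Mask2Former root so that the `tools/` path resolves:

```python
# Hypothetical helper (not part of the repo): download and convert the four Swin
# checkpoints from the README above in one go.
import subprocess
import urllib.request

BASE_URL = "https://github.com/SwinTransformer/storage/releases/download/v1.0.0/"
CHECKPOINTS = [
    "swin_tiny_patch4_window7_224",
    "swin_small_patch4_window7_224",
    "swin_base_patch4_window12_384_22k",
    "swin_large_patch4_window12_384_22k",
]

for name in CHECKPOINTS:
    # same URLs and conversion command as in the README
    urllib.request.urlretrieve(BASE_URL + name + ".pth", name + ".pth")
    subprocess.run(
        ["python", "tools/convert-pretrained-swin-model-to-d2.py", name + ".pth", name + ".pkl"],
        check=True,
    )
```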
--------------------------------------------------------------------------------
/third_party/Mask2Former/tools/convert-pretrained-swin-model-to-d2.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import pickle as pkl
import sys

import torch

"""
Usage:
  # download pretrained swin model:
  wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth
  # run the conversion
  ./convert-pretrained-swin-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl
  # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config:
MODEL:
  WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl"
INPUT:
  FORMAT: "RGB"
"""

if __name__ == "__main__":
    input = sys.argv[1]

    obj = torch.load(input, map_location="cpu")["model"]

    res = {"model": obj, "__author__": "third_party", "matching_heuristics": True}

    with open(sys.argv[2], "wb") as f:
        pkl.dump(res, f)
--------------------------------------------------------------------------------
/third_party/Mask2Former/tools/convert-torchvision-to-d2.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates.

import pickle as pkl
import sys

import torch

"""
Usage:
  # download one of the ResNet{18,34,50,101,152} models from torchvision:
  wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth
  # run the conversion
  ./convert-torchvision-to-d2.py r50.pth r50.pkl
  # Then, use r50.pkl with the following changes in config:
MODEL:
  WEIGHTS: "/path/to/r50.pkl"
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]
  RESNETS:
    DEPTH: 50
    STRIDE_IN_1X1: False
INPUT:
  FORMAT: "RGB"
"""

if __name__ == "__main__":
    input = sys.argv[1]

    obj = torch.load(input, map_location="cpu")

    newmodel = {}
    for k in list(obj.keys()):
        old_k = k
        if "layer" not in k:
            k = "stem." + k
        for t in [1, 2, 3, 4]:
            k = k.replace("layer{}".format(t), "res{}".format(t + 1))
        for t in [1, 2, 3]:
            k = k.replace("bn{}".format(t), "conv{}.norm".format(t))
        k = k.replace("downsample.0", "shortcut")
        k = k.replace("downsample.1", "shortcut.norm")
        print(old_k, "->", k)
        newmodel[k] = obj.pop(old_k).detach().numpy()

    res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True}

    with open(sys.argv[2], "wb") as f:
        pkl.dump(res, f)
    if obj:
        print("Unconverted keys:", obj.keys())
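To make the renaming loop in `convert-torchvision-to-d2.py` concrete, here is a small illustration (not part of the repository) that applies the same replacement rules to a few typical torchvision ResNet keys and shows the detectron2-style names they map to:

```python
# Illustration (not part of the repo) of the key renaming done by
# convert-torchvision-to-d2.py above, using the same replacement rules.
def rename(k):
    if "layer" not in k:
        k = "stem." + k                      # stem conv/bn get a "stem." prefix
    for t in [1, 2, 3, 4]:
        k = k.replace("layer{}".format(t), "res{}".format(t + 1))  # layerN -> res(N+1)
    for t in [1, 2, 3]:
        k = k.replace("bn{}".format(t), "conv{}.norm".format(t))   # bnN -> convN.norm
    k = k.replace("downsample.0", "shortcut")        # downsample conv -> shortcut
    return k.replace("downsample.1", "shortcut.norm")  # downsample bn -> shortcut.norm


assert rename("conv1.weight") == "stem.conv1.weight"
assert rename("layer1.0.bn1.running_mean") == "res2.0.conv1.norm.running_mean"
assert rename("layer4.0.downsample.0.weight") == "res5.0.shortcut.weight"
```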
--------------------------------------------------------------------------------
/third_party/Mask2Former/tools/evaluate_coco_boundary_ap.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# Modified by Bowen Cheng from: https://github.com/bowenc0221/boundary-iou-api/blob/master/tools/coco_instance_evaluation.py

"""
Evaluation for COCO val2017:
python ./tools/evaluate_coco_boundary_ap.py \
  --gt-json-file COCO_GT_JSON \
  --dt-json-file COCO_DT_JSON
"""
import argparse
import json

from boundary_iou.coco_instance_api.coco import COCO
from boundary_iou.coco_instance_api.cocoeval import COCOeval


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--gt-json-file", default="")
    parser.add_argument("--dt-json-file", default="")
    parser.add_argument("--iou-type", default="boundary")
    parser.add_argument("--dilation-ratio", default="0.020", type=float)
    args = parser.parse_args()
    print(args)

    annFile = args.gt_json_file
    resFile = args.dt_json_file
    dilation_ratio = args.dilation_ratio
    if args.iou_type == "boundary":
        get_boundary = True
    else:
        get_boundary = False
    cocoGt = COCO(annFile, get_boundary=get_boundary, dilation_ratio=dilation_ratio)

    # remove box predictions
    resFile = json.load(open(resFile))
    for c in resFile:
        c.pop("bbox", None)

    cocoDt = cocoGt.loadRes(resFile)
    cocoEval = COCOeval(cocoGt, cocoDt, iouType=args.iou_type, dilation_ratio=dilation_ratio)
    cocoEval.evaluate()
    cocoEval.accumulate()
    cocoEval.summarize()


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
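For reference, the file passed as `--dt-json-file` is expected to follow the standard COCO results format: a JSON list of per-instance records with RLE-encoded masks. The sketch below (illustrative placeholders only, not part of the repository) shows the minimal fields `cocoGt.loadRes()` consumes; any `bbox` field is dropped by the script above before loading.

```python
# Illustrative only: minimal COCO-style instance segmentation results, the format
# expected for --dt-json-file. All values below are placeholders.
import json

detections = [
    {
        "image_id": 139,                 # image id from the ground-truth JSON
        "category_id": 1,                # COCO category id
        "segmentation": {                # RLE-encoded mask
            "size": [480, 640],          # [height, width] of the image
            "counts": "<RLE counts string>",
        },
        "score": 0.97,
        # "bbox" may also be present; the script removes it before loadRes()
    }
]

with open("COCO_DT_JSON", "w") as f:     # placeholder path, as in the usage string
    json.dump(detections, f)
```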