├── .gitignore ├── GETTING_STARTED.md ├── INSTALL.md ├── LICENSE ├── README.md ├── colab └── oneformer_colab.ipynb ├── configs ├── ade20k │ ├── Base-ADE20K-UnifiedSegmentation.yaml │ ├── convnext │ │ ├── oneformer_convnext_large_bs16_160k.yaml │ │ └── oneformer_convnext_xlarge_bs16_160k.yaml │ ├── dinat │ │ ├── coco_pretrain_oneformer_dinat_large_bs16_160k_1280x1280.yaml │ │ ├── oneformer_dinat_large_bs16_160k.yaml │ │ ├── oneformer_dinat_large_bs16_160k_1280x1280.yaml │ │ └── oneformer_dinat_large_bs16_160k_896x896.yaml │ ├── oneformer_R50_bs16_160k.yaml │ └── swin │ │ ├── oneformer_swin_large_bs16_160k.yaml │ │ ├── oneformer_swin_large_bs16_160k_1280x1280.yaml │ │ ├── oneformer_swin_large_bs16_160k_896x896.yaml │ │ └── oneformer_swin_tiny_bs16_160k.yaml ├── cityscapes │ ├── Base-Cityscapes-UnifiedSegmentation.yaml │ ├── convnext │ │ ├── mapillary_pretrain_oneformer_convnext_large_bs16_90k.yaml │ │ ├── mapillary_pretrain_oneformer_convnext_xlarge_bs16_90k.yaml │ │ ├── oneformer_convnext_large_bs16_90k.yaml │ │ └── oneformer_convnext_xlarge_bs16_90k.yaml │ ├── dinat │ │ └── oneformer_dinat_large_bs16_90k.yaml │ ├── oneformer_R50_bs16_90k.yaml │ └── swin │ │ └── oneformer_swin_large_bs16_90k.yaml ├── coco │ ├── Base-COCO-UnifiedSegmentation.yaml │ ├── dinat │ │ └── oneformer_dinat_large_bs16_100ep.yaml │ ├── oneformer_R50_bs16_50ep.yaml │ └── swin │ │ ├── oneformer_swin_large_bs16_100ep.yaml │ │ └── oneformer_swin_tiny_bs16_50ep.yaml └── mapillary_vistas │ ├── Base-Mapillary-UnifiedSegmentation.yaml │ ├── convnext │ ├── cityscapes_pretrain_oneformer_convnext_large_bs16_300k.yaml │ ├── cityscapes_pretrain_oneformer_convnext_xlarge_bs16_300k.yaml │ └── oneformer_convnext_large_bs16_300k.yaml │ ├── dinat │ └── oneformer_dinat_large_bs16_300k.yaml │ ├── oneformer_R50_bs16_300k.yaml │ └── swin │ └── oneformer_swin_large_bs16_300k.yaml ├── datasets ├── README.md ├── ade20k_instance_catid_mapping.txt ├── ade20k_instance_imgCatIds.json ├── custom_datasets │ ├── README.md │ ├── instance_coco_custom_dataset_mapper.py │ ├── instance_oneformer_custom_dataset_mapper.py │ └── semantic_oneformer_custom_dataset_mapper.py ├── fg_ids.py ├── panoptic2detection_coco_format.py ├── panoptic_coco_categories.json ├── prepare_ade20k_ins_seg.py ├── prepare_ade20k_pan_seg.py ├── prepare_ade20k_sem_seg.py └── prepare_coco_semantic_annos_from_panoptic_annos.py ├── demo ├── README.md ├── colormap.py ├── defaults.py ├── demo.py ├── predictor.py └── visualizer.py ├── images ├── oneformer.svg ├── plots.svg ├── teaser.png └── teaser.svg ├── oneformer ├── __init__.py ├── config.py ├── data │ ├── __init__.py │ ├── bpe_simple_vocab_16e6.txt │ ├── bpe_simple_vocab_16e6.txt.gz │ ├── build.py │ ├── dataset_mappers │ │ ├── __init__.py │ │ ├── coco_unified_new_baseline_dataset_mapper.py │ │ ├── dataset_mapper.py │ │ └── oneformer_unified_dataset_mapper.py │ ├── datasets │ │ ├── __init__.py │ │ ├── register_ade20k_instance.py │ │ ├── register_ade20k_panoptic.py │ │ ├── register_cityscapes_panoptic.py │ │ ├── register_coco_panoptic2instance.py │ │ ├── register_coco_panoptic_annos_semseg.py │ │ ├── register_mapillary_vistas.py │ │ └── register_mapillary_vistas_panoptic.py │ └── tokenizer.py ├── datasetmapper_tta.py ├── evaluation │ ├── __init__.py │ ├── cityscapes_evaluation.py │ ├── coco_evaluator.py │ ├── detection_coco_evaluator.py │ ├── evaluator.py │ └── instance_evaluation.py ├── modeling │ ├── __init__.py │ ├── backbone │ │ ├── __init__.py │ │ ├── convnext.py │ │ ├── dinat.py │ │ └── swin.py │ ├── criterion.py │ ├── matcher.py 
│ ├── meta_arch │ │ ├── __init__.py │ │ └── oneformer_head.py │ ├── pixel_decoder │ │ ├── __init__.py │ │ ├── fpn.py │ │ ├── msdeformattn.py │ │ └── ops │ │ │ ├── functions │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn_func.py │ │ │ ├── make.sh │ │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn.py │ │ │ ├── setup.py │ │ │ ├── src │ │ │ ├── cpu │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ └── ms_deform_attn_cpu.h │ │ │ ├── cuda │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ ├── ms_deform_attn.h │ │ │ └── vision.cpp │ │ │ └── test.py │ └── transformer_decoder │ │ ├── __init__.py │ │ ├── oneformer_transformer_decoder.py │ │ ├── position_encoding.py │ │ ├── text_transformer.py │ │ └── transformer.py ├── oneformer_model.py ├── test_time_augmentation.py └── utils │ ├── __init__.py │ ├── box_ops.py │ ├── events.py │ ├── misc.py │ └── pos_embed.py ├── requirements.txt ├── tools ├── README.md ├── analyze_model.py ├── calc_throughput.py ├── convert-pretrained-model-to-d2.py ├── convert-pretrained-nat-model-to-d2.py ├── convert-torchvision-to-d2.py ├── setup_detectron2.py └── trainers │ ├── trainer.py │ └── trainer_base.py └── train_net.py /.gitignore: -------------------------------------------------------------------------------- 1 | *_video 2 | *_video.py 3 | extras/ 4 | 5 | # output dir 6 | .DS_Store 7 | output 8 | instant_test_output 9 | inference_test_output 10 | 11 | *.json 12 | *.diff 13 | *.jpg 14 | !/projects/DensePose/doc/images/*.jpg 15 | 16 | # compilation and distribution 17 | __pycache__ 18 | _ext 19 | *.pyc 20 | *.pyd 21 | *.so 22 | *.dll 23 | *.egg-info/ 24 | build/ 25 | dist/ 26 | wheels/ 27 | 28 | # pytorch/python/numpy formats 29 | *.pth 30 | *.pkl 31 | *.npy 32 | *.ts 33 | model_ts*.txt 34 | 35 | # ipython/jupyter notebooks 36 | **/.ipynb_checkpoints/ 37 | 38 | # Editor temporaries 39 | *.swn 40 | *.swo 41 | *.swp 42 | *~ 43 | 44 | # editor settings 45 | .idea 46 | .vscode 47 | _darcs 48 | 49 | # project dirs 50 | /detectron2/model_zoo/configs 51 | /projects/*/datasets 52 | /models 53 | /snippet -------------------------------------------------------------------------------- /GETTING_STARTED.md: -------------------------------------------------------------------------------- 1 | # Getting Started with OneFormer 2 | 3 | This document provides a brief introduction to the usage of OneFormer. 4 | 5 | Please see [Getting Started with Detectron2](https://github.com/facebookresearch/detectron2/blob/master/GETTING_STARTED.md) for full usage. 6 | 7 | ## Training 8 | 9 | - Make sure to set up wandb before training a model. 10 | 11 | ```bash 12 | pip install wandb 13 | wandb login 14 | ``` 15 | 16 | - We provide a script, `train_net.py`, that trains all the configs provided in OneFormer. 17 | 18 | - To train a model with `train_net.py`, first set up the corresponding datasets following [datasets/README.md](./datasets/README.md). 19 | 20 | - By default, the model uses `task=panoptic` for evaluation during training. 21 | 22 | ```bash 23 | python train_net.py --dist-url 'tcp://127.0.0.1:50163' \ 24 | --num-gpus 8 \ 25 | --config-file configs/ade20k/swin/oneformer_swin_large_bs16_160k.yaml \ 26 | OUTPUT_DIR outputs/ade20k_swin_large WANDB.NAME ade20k_swin_large 27 | ``` 28 | 29 | ## Evaluation 30 | 31 | - You need to pass the value of the `task` token. `task` must be one of [panoptic, semantic, instance].
32 | 33 | - To evaluate a model's performance, use: 34 | 35 | ```bash 36 | python train_net.py --dist-url 'tcp://127.0.0.1:50164' \ 37 | --num-gpus 8 \ 38 | --config-file configs/ade20k/swin/oneformer_swin_large_bs16_160k.yaml \ 39 | --eval-only MODEL.IS_TRAIN False MODEL.WEIGHTS <path-to-checkpoint> \ 40 | MODEL.TEST.TASK <task> 41 | ``` 42 | 43 | ## Inference Demo 44 | 45 | We provide a demo script for inference on images. For more information, please see [demo/README.md](demo/README.md). 46 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ## Requirements 4 | 5 | We use an environment with the following specifications, packages and dependencies: 6 | 7 | - Ubuntu 20.04.3 LTS 8 | - Python 3.8.13 9 | - conda 4.12.0 10 | - [PyTorch v1.10.1](https://pytorch.org/get-started/previous-versions/) 11 | - [Torchvision v0.11.2](https://pytorch.org/get-started/previous-versions/) 12 | - [Detectron2 v0.6](https://github.com/facebookresearch/detectron2/releases/tag/v0.6) 13 | - [NATTEN v0.14.4](https://github.com/SHI-Labs/NATTEN/releases/tag/v0.14.4) 14 | 15 | ## Setup Instructions 16 | 17 | - Create a conda environment. 18 | 19 | ```bash 20 | conda create --name oneformer python=3.8 -y 21 | conda activate oneformer 22 | ``` 23 | 24 | - Install packages and other dependencies. 25 | 26 | ```bash 27 | git clone https://github.com/SHI-Labs/OneFormer.git 28 | cd OneFormer 29 | 30 | # Install PyTorch 31 | conda install pytorch==1.10.1 torchvision==0.11.2 cudatoolkit=11.3 -c pytorch -c conda-forge 32 | 33 | # Install opencv (required for running the demo) 34 | pip3 install -U opencv-python 35 | 36 | # Install detectron2 37 | python tools/setup_detectron2.py 38 | 39 | # Install other dependencies 40 | pip3 install git+https://github.com/cocodataset/panopticapi.git 41 | pip3 install git+https://github.com/mcordts/cityscapesScripts.git 42 | pip3 install -r requirements.txt 43 | ``` 44 | 45 | - Set up wandb. 46 | 47 | ```bash 48 | # Set up wandb 49 | pip3 install wandb 50 | wandb login 51 | ``` 52 | 53 | - Set up the CUDA kernel for MSDeformAttn. `CUDA_HOME` must be defined and point to the directory of the installed CUDA toolkit. 54 | 55 | ```bash 56 | # Setup MSDeformAttn 57 | cd oneformer/modeling/pixel_decoder/ops 58 | sh make.sh 59 | cd ../../../.. 60 | ``` 61 | 62 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 SHI Labs 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /configs/ade20k/Base-ADE20K-UnifiedSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_panoptic_train",) 18 | TEST_PANOPTIC: ("ade20k_panoptic_val",) 19 | TEST_INSTANCE: ("ade20k_instance_val",) 20 | TEST_SEMANTIC: ("ade20k_sem_seg_val",) 21 | SOLVER: 22 | IMS_PER_BATCH: 16 23 | BASE_LR: 0.0001 24 | MAX_ITER: 160000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 0 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | LR_SCHEDULER_NAME: "WarmupPolyLR" 30 | BACKBONE_MULTIPLIER: 0.1 31 | CLIP_GRADIENTS: 32 | ENABLED: True 33 | CLIP_TYPE: "full_model" 34 | CLIP_VALUE: 0.01 35 | NORM_TYPE: 2.0 36 | AMP: 37 | ENABLED: True 38 | INPUT: 39 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"] 40 | MIN_SIZE_TRAIN_SAMPLING: "choice" 41 | MIN_SIZE_TEST: 512 42 | MAX_SIZE_TRAIN: 2048 43 | MAX_SIZE_TEST: 2048 44 | CROP: 45 | ENABLED: True 46 | TYPE: "absolute" 47 | SIZE: (512, 512) 48 | SINGLE_CATEGORY_MAX_AREA: 1.0 49 | COLOR_AUG_SSD: True 50 | SIZE_DIVISIBILITY: 512 # used in dataset mapper 51 | FORMAT: "RGB" 52 | DATASET_MAPPER_NAME: "oneformer_unified" 53 | MAX_SEQ_LEN: 77 54 | TASK_SEQ_LEN: 77 55 | TASK_PROB: 56 | SEMANTIC: 0.33 57 | INSTANCE: 0.66 58 | TEST: 59 | EVAL_PERIOD: 5000 60 | AUG: 61 | ENABLED: False 62 | MIN_SIZES: [256, 384, 512, 640, 768, 896] 63 | MAX_SIZE: 3584 64 | FLIP: True 65 | DATALOADER: 66 | FILTER_EMPTY_ANNOTATIONS: True 67 | NUM_WORKERS: 4 68 | VERSION: 2 -------------------------------------------------------------------------------- /configs/ade20k/convnext/oneformer_convnext_large_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2ConvNeXt" 5 | CONVNEXT: 6 | IN_CHANNELS: 3 7 | DEPTHS: [3, 3, 27, 3] 8 | DIMS: [192, 384, 768, 1536] 9 | DROP_PATH_RATE: 0.4 10 | LSIT: 1.0 11 | OUT_INDICES: [0, 1, 2, 3] 12 | WEIGHTS: "convnext_large_22k_1k_384.pkl" 13 | PIXEL_MEAN: [123.675, 116.280, 103.530] 14 | PIXEL_STD: [58.395, 57.120, 57.375] 15 | ONE_FORMER: 16 | NUM_OBJECT_QUERIES: 250 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | DETECTIONS_PER_IMAGE: 250 33 | EVAL_PERIOD: 5000 34 | AUG: 35 | ENABLED: False 36 | MIN_SIZES: [320, 480, 640, 800, 
960, 1120] 37 | MAX_SIZE: 4480 38 | FLIP: True 39 | -------------------------------------------------------------------------------- /configs/ade20k/convnext/oneformer_convnext_xlarge_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2ConvNeXt" 5 | CONVNEXT: 6 | IN_CHANNELS: 3 7 | DEPTHS: [3, 3, 27, 3] 8 | DIMS: [256, 512, 1024, 2048] 9 | DROP_PATH_RATE: 0.4 10 | LSIT: 1.0 11 | OUT_INDICES: [0, 1, 2, 3] 12 | WEIGHTS: "convnext_xlarge_22k_1k_384_ema.pkl" 13 | PIXEL_MEAN: [123.675, 116.280, 103.530] 14 | PIXEL_STD: [58.395, 57.120, 57.375] 15 | ONE_FORMER: 16 | NUM_OBJECT_QUERIES: 250 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | DETECTIONS_PER_IMAGE: 250 33 | EVAL_PERIOD: 5000 34 | AUG: 35 | ENABLED: False 36 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 37 | MAX_SIZE: 4480 38 | FLIP: True 39 | -------------------------------------------------------------------------------- /configs/ade20k/dinat/coco_pretrain_oneformer_dinat_large_bs16_160k_1280x1280.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2DiNAT" 5 | DiNAT: 6 | EMBED_DIM: 192 7 | MLP_RATIO: 2.0 8 | DEPTHS: [3, 4, 18, 5] 9 | NUM_HEADS: [6, 12, 24, 48] 10 | KERNEL_SIZE: 11 11 | DROP_PATH_RATE: 0.3 12 | DILATIONS: [[1, 28, 1], [1, 7, 1, 14], [1, 3, 1, 5, 1, 5, 1, 7, 1, 3, 1, 5, 1, 5, 1, 7, 1, 7], [1, 3, 1, 3, 1]] 13 | WEIGHTS: "150_16_dinat_l_oneformer_coco_100ep.pth" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | ONE_FORMER: 17 | NUM_OBJECT_QUERIES: 150 18 | SOLVER: 19 | AMP: 20 | ENABLED: False 21 | INPUT: 22 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1280) for x in range(5, 21)]"] 23 | MIN_SIZE_TRAIN_SAMPLING: "choice" 24 | MIN_SIZE_TEST: 1280 25 | MAX_SIZE_TRAIN: 5120 26 | MAX_SIZE_TEST: 5120 27 | CROP: 28 | ENABLED: True 29 | TYPE: "absolute" 30 | SIZE: (1280, 1280) 31 | SINGLE_CATEGORY_MAX_AREA: 1.0 32 | COLOR_AUG_SSD: True 33 | SIZE_DIVISIBILITY: 1280 # used in dataset mapper 34 | FORMAT: "RGB" 35 | TEST: 36 | DETECTIONS_PER_IMAGE: 150 37 | EVAL_PERIOD: 5000 38 | AUG: 39 | ENABLED: False 40 | MIN_SIZES: [640, 960, 1280, 1600, 1920, 2240] 41 | MAX_SIZE: 8960 42 | FLIP: True -------------------------------------------------------------------------------- /configs/ade20k/dinat/oneformer_dinat_large_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2DiNAT" 5 | DiNAT: 6 | EMBED_DIM: 192 7 | MLP_RATIO: 2.0 8 | DEPTHS: [3, 4, 18, 5] 9 | NUM_HEADS: [6, 12, 24, 48] 10 | KERNEL_SIZE: 11 11 | DROP_PATH_RATE: 0.3 12 | DILATIONS: [[1, 20, 1], [1, 5, 1, 10], [1, 2, 1, 3, 1, 4, 1, 5, 1, 2, 1, 3, 1, 4, 1, 5, 1, 5], [1, 2, 1, 2, 1]] 13 | WEIGHTS: "dinat_large_in22k_in1k_384_11x11.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | ONE_FORMER: 17 | NUM_OBJECT_QUERIES: 250 18 | 
SOLVER: 19 | AMP: 20 | ENABLED: False 21 | INPUT: 22 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 23 | MIN_SIZE_TRAIN_SAMPLING: "choice" 24 | MIN_SIZE_TEST: 640 25 | MAX_SIZE_TRAIN: 2560 26 | MAX_SIZE_TEST: 2560 27 | CROP: 28 | ENABLED: True 29 | TYPE: "absolute" 30 | SIZE: (640, 640) 31 | SINGLE_CATEGORY_MAX_AREA: 1.0 32 | COLOR_AUG_SSD: True 33 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 34 | FORMAT: "RGB" 35 | TEST: 36 | DETECTIONS_PER_IMAGE: 250 37 | EVAL_PERIOD: 5000 38 | AUG: 39 | ENABLED: False 40 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 41 | MAX_SIZE: 4480 42 | FLIP: True 43 | -------------------------------------------------------------------------------- /configs/ade20k/dinat/oneformer_dinat_large_bs16_160k_1280x1280.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2DiNAT" 5 | DiNAT: 6 | EMBED_DIM: 192 7 | MLP_RATIO: 2.0 8 | DEPTHS: [3, 4, 18, 5] 9 | NUM_HEADS: [6, 12, 24, 48] 10 | KERNEL_SIZE: 11 11 | DROP_PATH_RATE: 0.3 12 | DILATIONS: [[1, 28, 1], [1, 7, 1, 14], [1, 3, 1, 5, 1, 5, 1, 7, 1, 3, 1, 5, 1, 5, 1, 7, 1, 7], [1, 3, 1, 3, 1]] 13 | WEIGHTS: "dinat_large_in22k_in1k_384_11x11.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | ONE_FORMER: 17 | NUM_OBJECT_QUERIES: 250 18 | SOLVER: 19 | AMP: 20 | ENABLED: False 21 | INPUT: 22 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1280) for x in range(5, 21)]"] 23 | MIN_SIZE_TRAIN_SAMPLING: "choice" 24 | MIN_SIZE_TEST: 1280 25 | MAX_SIZE_TRAIN: 5120 26 | MAX_SIZE_TEST: 5120 27 | CROP: 28 | ENABLED: True 29 | TYPE: "absolute" 30 | SIZE: (1280, 1280) 31 | SINGLE_CATEGORY_MAX_AREA: 1.0 32 | COLOR_AUG_SSD: True 33 | SIZE_DIVISIBILITY: 1280 # used in dataset mapper 34 | FORMAT: "RGB" 35 | TEST: 36 | DETECTIONS_PER_IMAGE: 250 37 | EVAL_PERIOD: 5000 38 | AUG: 39 | ENABLED: False 40 | MIN_SIZES: [640, 960, 1280, 1600, 1920, 2240] 41 | MAX_SIZE: 8960 42 | FLIP: True -------------------------------------------------------------------------------- /configs/ade20k/dinat/oneformer_dinat_large_bs16_160k_896x896.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2DiNAT" 5 | DiNAT: 6 | EMBED_DIM: 192 7 | MLP_RATIO: 2.0 8 | DEPTHS: [3, 4, 18, 5] 9 | NUM_HEADS: [6, 12, 24, 48] 10 | KERNEL_SIZE: 11 11 | DROP_PATH_RATE: 0.3 12 | DILATIONS: [[1, 20, 1], [1, 5, 1, 10], [1, 2, 1, 3, 1, 4, 1, 5, 1, 2, 1, 3, 1, 4, 1, 5, 1, 5], [1, 2, 1, 2, 1]] 13 | WEIGHTS: "dinat_large_in22k_in1k_384_11x11.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | ONE_FORMER: 17 | NUM_OBJECT_QUERIES: 250 18 | SOLVER: 19 | AMP: 20 | ENABLED: False 21 | INPUT: 22 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 896) for x in range(5, 21)]"] 23 | MIN_SIZE_TRAIN_SAMPLING: "choice" 24 | MIN_SIZE_TEST: 896 25 | MAX_SIZE_TRAIN: 3584 26 | MAX_SIZE_TEST: 3584 27 | CROP: 28 | ENABLED: True 29 | TYPE: "absolute" 30 | SIZE: (896, 896) 31 | SINGLE_CATEGORY_MAX_AREA: 1.0 32 | COLOR_AUG_SSD: True 33 | SIZE_DIVISIBILITY: 896 # used in dataset mapper 34 | FORMAT: "RGB" 35 | TEST: 36 | DETECTIONS_PER_IMAGE: 250 37 | EVAL_PERIOD: 5000 38 | AUG: 39 | ENABLED: False 40 | MIN_SIZES: [448, 678, 896, 1120, 1344, 1568] 41 | MAX_SIZE: 6272 42 | FLIP: True 43 | 
-------------------------------------------------------------------------------- /configs/ade20k/oneformer_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-UnifiedSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "OneFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "OneFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | ONE_FORMER: 19 | TRANSFORMER_DECODER_NAME: "ContrastiveMultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | CONTRASTIVE_WEIGHT: 0.5 27 | CONTRASTIVE_TEMPERATURE: 0.07 28 | HIDDEN_DIM: 256 29 | NUM_OBJECT_QUERIES: 150 30 | USE_TASK_NORM: True 31 | NHEADS: 8 32 | DROPOUT: 0.1 33 | DIM_FEEDFORWARD: 2048 34 | ENC_LAYERS: 0 35 | PRE_NORM: False 36 | ENFORCE_INPUT_PROJ: False 37 | SIZE_DIVISIBILITY: 32 38 | CLASS_DEC_LAYERS: 2 39 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 40 | TRAIN_NUM_POINTS: 12544 41 | OVERSAMPLE_RATIO: 3.0 42 | IMPORTANCE_SAMPLE_RATIO: 0.75 43 | TEXT_ENCODER: 44 | WIDTH: 256 45 | CONTEXT_LENGTH: 77 46 | NUM_LAYERS: 6 47 | VOCAB_SIZE: 49408 48 | PROJ_NUM_LAYERS: 2 49 | N_CTX: 16 50 | TEST: 51 | SEMANTIC_ON: True 52 | INSTANCE_ON: True 53 | PANOPTIC_ON: True 54 | OVERLAP_THRESHOLD: 0.8 55 | OBJECT_MASK_THRESHOLD: 0.5 56 | TASK: "panoptic" 57 | TEST: 58 | DETECTIONS_PER_IMAGE: 150 59 | -------------------------------------------------------------------------------- /configs/ade20k/swin/oneformer_swin_large_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22kto1k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | ONE_FORMER: 18 | NUM_OBJECT_QUERIES: 250 19 | INPUT: 20 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 21 | MIN_SIZE_TRAIN_SAMPLING: "choice" 22 | MIN_SIZE_TEST: 640 23 | MAX_SIZE_TRAIN: 2560 24 | MAX_SIZE_TEST: 2560 25 | CROP: 26 | ENABLED: True 27 | TYPE: "absolute" 28 | SIZE: (640, 640) 29 | SINGLE_CATEGORY_MAX_AREA: 1.0 30 | COLOR_AUG_SSD: True 31 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 32 | FORMAT: "RGB" 33 | TEST: 34 | DETECTIONS_PER_IMAGE: 250 35 | EVAL_PERIOD: 5000 36 | AUG: 37 | ENABLED: False 38 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 39 | MAX_SIZE: 4480 40 | FLIP: True 41 | -------------------------------------------------------------------------------- /configs/ade20k/swin/oneformer_swin_large_bs16_160k_1280x1280.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 
12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22kto1k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | ONE_FORMER: 18 | NUM_OBJECT_QUERIES: 250 19 | INPUT: 20 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1280) for x in range(5, 21)]"] 21 | MIN_SIZE_TRAIN_SAMPLING: "choice" 22 | MIN_SIZE_TEST: 1280 23 | MAX_SIZE_TRAIN: 5120 24 | MAX_SIZE_TEST: 5120 25 | CROP: 26 | ENABLED: True 27 | TYPE: "absolute" 28 | SIZE: (1280, 1280) 29 | SINGLE_CATEGORY_MAX_AREA: 1.0 30 | COLOR_AUG_SSD: True 31 | SIZE_DIVISIBILITY: 1280 # used in dataset mapper 32 | FORMAT: "RGB" 33 | TEST: 34 | DETECTIONS_PER_IMAGE: 250 35 | EVAL_PERIOD: 5000 36 | AUG: 37 | ENABLED: False 38 | MIN_SIZES: [640, 960, 1280, 1600, 1920, 2240] 39 | MAX_SIZE: 8960 40 | FLIP: True 41 | -------------------------------------------------------------------------------- /configs/ade20k/swin/oneformer_swin_large_bs16_160k_896x896.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22kto1k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | ONE_FORMER: 18 | NUM_OBJECT_QUERIES: 250 19 | INPUT: 20 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 896) for x in range(5, 21)]"] 21 | MIN_SIZE_TRAIN_SAMPLING: "choice" 22 | MIN_SIZE_TEST: 896 23 | MAX_SIZE_TRAIN: 3584 24 | MAX_SIZE_TEST: 3584 25 | CROP: 26 | ENABLED: True 27 | TYPE: "absolute" 28 | SIZE: (896, 896) 29 | SINGLE_CATEGORY_MAX_AREA: 1.0 30 | COLOR_AUG_SSD: True 31 | SIZE_DIVISIBILITY: 896 # used in dataset mapper 32 | FORMAT: "RGB" 33 | TEST: 34 | DETECTIONS_PER_IMAGE: 250 35 | EVAL_PERIOD: 5000 36 | AUG: 37 | ENABLED: False 38 | MIN_SIZES: [448, 678, 896, 1120, 1344, 1568] 39 | MAX_SIZE: 6272 40 | FLIP: True 41 | -------------------------------------------------------------------------------- /configs/ade20k/swin/oneformer_swin_tiny_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/cityscapes/Base-Cityscapes-UnifiedSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes 
dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_panoptic_train",) 18 | TEST_PANOPTIC: ("cityscapes_fine_panoptic_val",) 19 | TEST_INSTANCE: ("cityscapes_fine_instance_seg_val",) 20 | TEST_SEMANTIC: ("cityscapes_fine_sem_seg_val",) 21 | SOLVER: 22 | IMS_PER_BATCH: 16 23 | BASE_LR: 0.0001 24 | MAX_ITER: 90000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 0 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | LR_SCHEDULER_NAME: "WarmupPolyLR" 30 | BACKBONE_MULTIPLIER: 0.1 31 | CLIP_GRADIENTS: 32 | ENABLED: True 33 | CLIP_TYPE: "full_model" 34 | CLIP_VALUE: 0.01 35 | NORM_TYPE: 2.0 36 | AMP: 37 | ENABLED: True 38 | INPUT: 39 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 40 | MIN_SIZE_TRAIN_SAMPLING: "choice" 41 | MIN_SIZE_TEST: 1024 42 | MAX_SIZE_TRAIN: 4096 43 | MAX_SIZE_TEST: 2048 44 | CROP: 45 | ENABLED: True 46 | TYPE: "absolute" 47 | SIZE: (512, 1024) 48 | SINGLE_CATEGORY_MAX_AREA: 1.0 49 | COLOR_AUG_SSD: True 50 | SIZE_DIVISIBILITY: -1 51 | FORMAT: "RGB" 52 | DATASET_MAPPER_NAME: "oneformer_unified" 53 | MAX_SEQ_LEN: 77 54 | TASK_SEQ_LEN: 77 55 | TASK_PROB: 56 | SEMANTIC: 0.33 57 | INSTANCE: 0.66 58 | TEST: 59 | EVAL_PERIOD: 5000 60 | AUG: 61 | ENABLED: False 62 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 63 | MAX_SIZE: 4096 64 | FLIP: True 65 | DATALOADER: 66 | FILTER_EMPTY_ANNOTATIONS: True 67 | NUM_WORKERS: 4 68 | VERSION: 2 -------------------------------------------------------------------------------- /configs/cityscapes/convnext/mapillary_pretrain_oneformer_convnext_large_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2ConvNeXt" 5 | CONVNEXT: 6 | IN_CHANNELS: 3 7 | DEPTHS: [3, 3, 27, 3] 8 | DIMS: [192, 384, 768, 1536] 9 | DROP_PATH_RATE: 0.4 10 | LSIT: 1.0 11 | OUT_INDICES: [0, 1, 2, 3] 12 | WEIGHTS: "250_16_convnext_l_oneformer_mapillary_300k.pth" 13 | PIXEL_MEAN: [123.675, 116.280, 103.530] 14 | PIXEL_STD: [58.395, 57.120, 57.375] 15 | ONE_FORMER: 16 | NUM_OBJECT_QUERIES: 250 17 | TEST: 18 | DETECTIONS_PER_IMAGE: 250 19 | -------------------------------------------------------------------------------- /configs/cityscapes/convnext/mapillary_pretrain_oneformer_convnext_xlarge_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2ConvNeXt" 5 | CONVNEXT: 6 | IN_CHANNELS: 3 7 | DEPTHS: [3, 3, 27, 3] 8 | DIMS: [256, 512, 1024, 2048] 9 | DROP_PATH_RATE: 0.4 10 | LSIT: 1.0 11 | OUT_INDICES: [0, 1, 2, 3] 12 | WEIGHTS: "250_16_convnext_xl_oneformer_mapillary_300k.pth" 13 | PIXEL_MEAN: [123.675, 116.280, 103.530] 14 | PIXEL_STD: [58.395, 57.120, 57.375] 15 | ONE_FORMER: 16 | NUM_OBJECT_QUERIES: 250 17 | TEST: 18 | DETECTIONS_PER_IMAGE: 250 19 | -------------------------------------------------------------------------------- /configs/cityscapes/convnext/oneformer_convnext_large_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2ConvNeXt" 5 | CONVNEXT: 6 | IN_CHANNELS: 3 7 | DEPTHS: [3, 3, 27, 3] 8 | DIMS: [192, 384, 768, 1536] 9 | DROP_PATH_RATE: 0.4 10 | LSIT: 1.0 11 | OUT_INDICES: [0, 1, 2, 3] 12 | WEIGHTS: "convnext_large_22k_1k_384.pkl" 13 | PIXEL_MEAN: [123.675, 116.280, 103.530] 14 | PIXEL_STD: [58.395, 57.120, 57.375] 
15 | ONE_FORMER: 16 | NUM_OBJECT_QUERIES: 250 17 | TEST: 18 | DETECTIONS_PER_IMAGE: 250 19 | -------------------------------------------------------------------------------- /configs/cityscapes/convnext/oneformer_convnext_xlarge_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2ConvNeXt" 5 | CONVNEXT: 6 | IN_CHANNELS: 3 7 | DEPTHS: [3, 3, 27, 3] 8 | DIMS: [256, 512, 1024, 2048] 9 | DROP_PATH_RATE: 0.4 10 | LSIT: 1.0 11 | OUT_INDICES: [0, 1, 2, 3] 12 | WEIGHTS: "convnext_xlarge_22k_1k_384_ema.pkl" 13 | PIXEL_MEAN: [123.675, 116.280, 103.530] 14 | PIXEL_STD: [58.395, 57.120, 57.375] 15 | ONE_FORMER: 16 | NUM_OBJECT_QUERIES: 250 17 | TEST: 18 | DETECTIONS_PER_IMAGE: 250 19 | -------------------------------------------------------------------------------- /configs/cityscapes/dinat/oneformer_dinat_large_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2DiNAT" 5 | DiNAT: 6 | EMBED_DIM: 192 7 | MLP_RATIO: 2.0 8 | DEPTHS: [3, 4, 18, 5] 9 | NUM_HEADS: [6, 12, 24, 48] 10 | KERNEL_SIZE: 7 11 | DROP_PATH_RATE: 0.3 12 | DILATIONS: [[1, 18, 1], [1, 5, 1, 9], [1, 2, 1, 3, 1, 4, 1, 2, 1, 3, 1, 4, 1, 2, 1, 3, 1, 4], [1, 2, 1, 2, 1]] 13 | WEIGHTS: "dinat_large_in22k_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | ONE_FORMER: 17 | NUM_OBJECT_QUERIES: 250 18 | SOLVER: 19 | AMP: 20 | ENABLED: False 21 | TEST: 22 | DETECTIONS_PER_IMAGE: 250 -------------------------------------------------------------------------------- /configs/cityscapes/oneformer_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-UnifiedSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "OneFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "OneFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | ONE_FORMER: 19 | TRANSFORMER_DECODER_NAME: "ContrastiveMultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | CONTRASTIVE_WEIGHT: 0.5 27 | CONTRASTIVE_TEMPERATURE: 0.07 28 | HIDDEN_DIM: 256 29 | NUM_OBJECT_QUERIES: 150 30 | USE_TASK_NORM: True 31 | NHEADS: 8 32 | DROPOUT: 0.1 33 | DIM_FEEDFORWARD: 2048 34 | ENC_LAYERS: 0 35 | PRE_NORM: False 36 | ENFORCE_INPUT_PROJ: False 37 | SIZE_DIVISIBILITY: 32 38 | ENC_LAYERS: 0 39 | CLASS_DEC_LAYERS: 2 40 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 41 | TRAIN_NUM_POINTS: 12544 42 | OVERSAMPLE_RATIO: 3.0 43 | IMPORTANCE_SAMPLE_RATIO: 0.75 44 | TEXT_ENCODER: 45 | WIDTH: 256 46 | CONTEXT_LENGTH: 77 47 | NUM_LAYERS: 6 48 | VOCAB_SIZE: 49408 49 | PROJ_NUM_LAYERS: 2 50 | N_CTX: 16 51 | TEST: 52 | SEMANTIC_ON: True 53 | INSTANCE_ON: True 54 | PANOPTIC_ON: True 55 | OVERLAP_THRESHOLD: 0.8 56 | OBJECT_MASK_THRESHOLD: 0.8 57 | TASK: "panoptic" 58 | TEST: 59 | DETECTIONS_PER_IMAGE: 150 60 | 
-------------------------------------------------------------------------------- /configs/cityscapes/swin/oneformer_swin_large_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22kto1k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | ONE_FORMER: 18 | NUM_OBJECT_QUERIES: 250 19 | TEST: 20 | DETECTIONS_PER_IMAGE: 250 21 | -------------------------------------------------------------------------------- /configs/coco/Base-COCO-UnifiedSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train_panoptic_with_sem_seg",) 18 | TEST_PANOPTIC: ("coco_2017_val_panoptic_with_sem_seg",) # to evaluate instance and semantic performance as well 19 | TEST_INSTANCE: ("coco_2017_val_panoptic2instance",) 20 | TEST_SEMANTIC: ("coco_2017_val_panoptic_with_sem_seg",) 21 | SOLVER: 22 | IMS_PER_BATCH: 16 23 | BASE_LR: 0.0001 24 | STEPS: (327778, 355092) 25 | MAX_ITER: 368750 26 | WARMUP_FACTOR: 1.0 27 | WARMUP_ITERS: 10 28 | WEIGHT_DECAY: 0.05 29 | OPTIMIZER: "ADAMW" 30 | BACKBONE_MULTIPLIER: 0.1 31 | CLIP_GRADIENTS: 32 | ENABLED: True 33 | CLIP_TYPE: "full_model" 34 | CLIP_VALUE: 0.01 35 | NORM_TYPE: 2.0 36 | AMP: 37 | ENABLED: True 38 | INPUT: 39 | IMAGE_SIZE: 1024 40 | MIN_SCALE: 0.1 41 | MAX_SCALE: 2.0 42 | FORMAT: "RGB" 43 | DATASET_MAPPER_NAME: "coco_unified_lsj" 44 | MAX_SEQ_LEN: 77 45 | TASK_SEQ_LEN: 77 46 | TASK_PROB: 47 | SEMANTIC: 0.33 48 | INSTANCE: 0.66 49 | TEST: 50 | EVAL_PERIOD: 5000 51 | DATALOADER: 52 | FILTER_EMPTY_ANNOTATIONS: True 53 | NUM_WORKERS: 4 54 | VERSION: 2 55 | -------------------------------------------------------------------------------- /configs/coco/dinat/oneformer_dinat_large_bs16_100ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2DiNAT" 5 | DiNAT: 6 | EMBED_DIM: 192 7 | MLP_RATIO: 2.0 8 | DEPTHS: [3, 4, 18, 5] 9 | NUM_HEADS: [6, 12, 24, 48] 10 | KERNEL_SIZE: 11 11 | DROP_PATH_RATE: 0.3 12 | DILATIONS: [[1, 20, 1], [1, 5, 1, 10], [1, 2, 1, 3, 1, 4, 1, 5, 1, 2, 1, 3, 1, 4, 1, 5, 1, 5], [1, 2, 1, 2, 1]] 13 | WEIGHTS: "dinat_large_in22k_in1k_384_11x11.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | ONE_FORMER: 17 | NUM_OBJECT_QUERIES: 150 18 | SOLVER: 19 | STEPS: (655556, 710184) 20 | MAX_ITER: 737500 21 | AMP: 22 | ENABLED: False 23 | TEST: 24 | DETECTIONS_PER_IMAGE: 150 -------------------------------------------------------------------------------- /configs/coco/oneformer_R50_bs16_50ep.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-UnifiedSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "OneFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "OneFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 133 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | ONE_FORMER: 19 | TRANSFORMER_DECODER_NAME: "ContrastiveMultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | CONTRASTIVE_WEIGHT: 0.5 27 | CONTRASTIVE_TEMPERATURE: 0.07 28 | HIDDEN_DIM: 256 29 | NUM_OBJECT_QUERIES: 150 30 | USE_TASK_NORM: True 31 | NHEADS: 8 32 | DROPOUT: 0.1 33 | DIM_FEEDFORWARD: 2048 34 | ENC_LAYERS: 0 35 | PRE_NORM: False 36 | ENFORCE_INPUT_PROJ: False 37 | SIZE_DIVISIBILITY: 32 38 | CLASS_DEC_LAYERS: 2 39 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 40 | TRAIN_NUM_POINTS: 12544 41 | OVERSAMPLE_RATIO: 3.0 42 | IMPORTANCE_SAMPLE_RATIO: 0.75 43 | TEXT_ENCODER: 44 | WIDTH: 256 45 | CONTEXT_LENGTH: 77 46 | NUM_LAYERS: 6 47 | VOCAB_SIZE: 49408 48 | PROJ_NUM_LAYERS: 2 49 | N_CTX: 16 50 | TEST: 51 | SEMANTIC_ON: True 52 | INSTANCE_ON: True 53 | PANOPTIC_ON: True 54 | DETECTION_ON: False 55 | OVERLAP_THRESHOLD: 0.8 56 | OBJECT_MASK_THRESHOLD: 0.8 57 | TASK: "panoptic" 58 | TEST: 59 | DETECTIONS_PER_IMAGE: 150 60 | -------------------------------------------------------------------------------- /configs/coco/swin/oneformer_swin_large_bs16_100ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22kto1k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | ONE_FORMER: 18 | NUM_OBJECT_QUERIES: 150 19 | SOLVER: 20 | STEPS: (655556, 735184) 21 | MAX_ITER: 737500 22 | AMP: 23 | ENABLED: False 24 | TEST: 25 | DETECTIONS_PER_IMAGE: 150 26 | -------------------------------------------------------------------------------- /configs/coco/swin/oneformer_swin_tiny_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/mapillary_vistas/Base-Mapillary-UnifiedSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: 
"detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_vistas_panoptic_train",) 18 | TEST_PANOPTIC: ("mapillary_vistas_panoptic_val",) 19 | TEST_INSTANCE: ("mapillary_vistas_panoptic_val",) 20 | TEST_SEMANTIC: ("mapillary_vistas_sem_seg_val",) 21 | SOLVER: 22 | IMS_PER_BATCH: 16 23 | BASE_LR: 0.0001 24 | MAX_ITER: 300000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 0 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | LR_SCHEDULER_NAME: "WarmupPolyLR" 30 | BACKBONE_MULTIPLIER: 0.1 31 | CLIP_GRADIENTS: 32 | ENABLED: True 33 | CLIP_TYPE: "full_model" 34 | CLIP_VALUE: 0.01 35 | NORM_TYPE: 2.0 36 | AMP: 37 | ENABLED: True 38 | INPUT: 39 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"] 40 | MIN_SIZE_TRAIN_SAMPLING: "choice" 41 | MIN_SIZE_TEST: 2048 42 | MAX_SIZE_TRAIN: 8192 43 | MAX_SIZE_TEST: 2048 44 | CROP: 45 | ENABLED: True 46 | TYPE: "absolute" 47 | SIZE: (1024, 1024) 48 | SINGLE_CATEGORY_MAX_AREA: 1.0 49 | COLOR_AUG_SSD: True 50 | SIZE_DIVISIBILITY: 1024 # used in dataset mapper 51 | FORMAT: "RGB" 52 | DATASET_MAPPER_NAME: "oneformer_unified" 53 | MAX_SEQ_LEN: 77 54 | TASK_SEQ_LEN: 77 55 | TASK_PROB: 56 | SEMANTIC: 0.50 57 | INSTANCE: 0.50 58 | TEST: 59 | EVAL_PERIOD: 30000 60 | AUG: 61 | ENABLED: False 62 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 63 | MAX_SIZE: 4096 64 | FLIP: True 65 | DATALOADER: 66 | FILTER_EMPTY_ANNOTATIONS: True 67 | NUM_WORKERS: 10 68 | VERSION: 2 -------------------------------------------------------------------------------- /configs/mapillary_vistas/convnext/cityscapes_pretrain_oneformer_convnext_large_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2ConvNeXt" 5 | CONVNEXT: 6 | IN_CHANNELS: 3 7 | DEPTHS: [3, 3, 27, 3] 8 | DIMS: [192, 384, 768, 1536] 9 | DROP_PATH_RATE: 0.4 10 | LSIT: 1.0 11 | OUT_INDICES: [0, 1, 2, 3] 12 | WEIGHTS: "convnext_large_22k_1k_384.pkl" 13 | PIXEL_MEAN: [123.675, 116.280, 103.530] 14 | PIXEL_STD: [58.395, 57.120, 57.375] 15 | ONE_FORMER: 16 | NUM_OBJECT_QUERIES: 250 17 | INPUT: 18 | TASK_PROB: 19 | SEMANTIC: 0.33 20 | INSTANCE: 0.66 21 | TEST: 22 | DETECTIONS_PER_IMAGE: 250 23 | -------------------------------------------------------------------------------- /configs/mapillary_vistas/convnext/cityscapes_pretrain_oneformer_convnext_xlarge_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2ConvNeXt" 5 | CONVNEXT: 6 | IN_CHANNELS: 3 7 | DEPTHS: [3, 3, 27, 3] 8 | DIMS: [256, 512, 1024, 2048] 9 | DROP_PATH_RATE: 0.4 10 | LSIT: 1.0 11 | OUT_INDICES: [0, 1, 2, 3] 12 | WEIGHTS: "convnext_xlarge_22k_1k_384_ema.pkl" 13 | PIXEL_MEAN: [123.675, 116.280, 103.530] 14 | PIXEL_STD: [58.395, 57.120, 57.375] 15 | ONE_FORMER: 16 | NUM_OBJECT_QUERIES: 250 17 | INPUT: 18 | TASK_PROB: 19 | SEMANTIC: 0.33 20 | INSTANCE: 0.66 21 | TEST: 22 | DETECTIONS_PER_IMAGE: 250 23 | -------------------------------------------------------------------------------- 
/configs/mapillary_vistas/convnext/oneformer_convnext_large_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2ConvNeXt" 5 | CONVNEXT: 6 | IN_CHANNELS: 3 7 | DEPTHS: [3, 3, 27, 3] 8 | DIMS: [192, 384, 768, 1536] 9 | DROP_PATH_RATE: 0.4 10 | LSIT: 1.0 11 | OUT_INDICES: [0, 1, 2, 3] 12 | WEIGHTS: "convnext_large_22k_1k_384.pkl" 13 | PIXEL_MEAN: [123.675, 116.280, 103.530] 14 | PIXEL_STD: [58.395, 57.120, 57.375] 15 | ONE_FORMER: 16 | NUM_OBJECT_QUERIES: 250 17 | TEST: 18 | DETECTIONS_PER_IMAGE: 250 19 | -------------------------------------------------------------------------------- /configs/mapillary_vistas/dinat/oneformer_dinat_large_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2DiNAT" 5 | DiNAT: 6 | EMBED_DIM: 192 7 | MLP_RATIO: 2.0 8 | DEPTHS: [3, 4, 18, 5] 9 | NUM_HEADS: [6, 12, 24, 48] 10 | KERNEL_SIZE: 11 11 | DROP_PATH_RATE: 0.3 12 | DILATIONS: [[1, 20, 1], [1, 5, 1, 10], [1, 2, 1, 3, 1, 4, 1, 5, 1, 2, 1, 3, 1, 4, 1, 5, 1, 5], [1, 2, 1, 2, 1]] 13 | WEIGHTS: "dinat_large_in22k_in1k_384_11x11.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | ONE_FORMER: 17 | NUM_OBJECT_QUERIES: 250 18 | SOLVER: 19 | AMP: 20 | ENABLED: False 21 | TEST: 22 | DETECTIONS_PER_IMAGE: 250 -------------------------------------------------------------------------------- /configs/mapillary_vistas/oneformer_R50_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Mapillary-UnifiedSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "OneFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "OneFormerHead" 6 | IGNORE_VALUE: 65 7 | NUM_CLASSES: 65 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | ONE_FORMER: 19 | TRANSFORMER_DECODER_NAME: "ContrastiveMultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | CONTRASTIVE_WEIGHT: 0.5 27 | CONTRASTIVE_TEMPERATURE: 0.07 28 | HIDDEN_DIM: 256 29 | NUM_OBJECT_QUERIES: 150 30 | USE_TASK_NORM: True 31 | NHEADS: 8 32 | DROPOUT: 0.1 33 | DIM_FEEDFORWARD: 2048 34 | ENC_LAYERS: 0 35 | PRE_NORM: False 36 | ENFORCE_INPUT_PROJ: False 37 | SIZE_DIVISIBILITY: 32 38 | ENC_LAYERS: 0 39 | CLASS_DEC_LAYERS: 2 40 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 41 | TRAIN_NUM_POINTS: 12544 42 | OVERSAMPLE_RATIO: 3.0 43 | IMPORTANCE_SAMPLE_RATIO: 0.75 44 | TEXT_ENCODER: 45 | WIDTH: 256 46 | CONTEXT_LENGTH: 77 47 | NUM_LAYERS: 6 48 | VOCAB_SIZE: 49408 49 | PROJ_NUM_LAYERS: 2 50 | N_CTX: 16 51 | TEST: 52 | SEMANTIC_ON: True 53 | INSTANCE_ON: True 54 | PANOPTIC_ON: True 55 | OVERLAP_THRESHOLD: 0.8 56 | OBJECT_MASK_THRESHOLD: 0.8 57 | TASK: "panoptic" 58 | TEST: 59 | DETECTIONS_PER_IMAGE: 150 60 | -------------------------------------------------------------------------------- /configs/mapillary_vistas/swin/oneformer_swin_large_bs16_300k.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: ../oneformer_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22kto1k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | ONE_FORMER: 18 | NUM_OBJECT_QUERIES: 250 19 | TEST: 20 | DETECTIONS_PER_IMAGE: 250 21 | -------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | # Prepare Datasets for OneFormer 2 | 3 | - A dataset can be used by accessing [DatasetCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.DatasetCatalog) for its data, or [MetadataCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.MetadataCatalog) for its metadata (class names, etc). 4 | - This document explains how to setup the builtin datasets so they can be used by the above APIs. [Training OneFormer with Custom Datasets](https://github.com/SHI-Labs/OneFormer/tree/main/datasets/custom_datasets) gives a deeper dive on how to train OneFormer with custom datasets. 5 | - Detectron2 has builtin support for a few datasets. The datasets are assumed to exist in a directory specified by the environment variable `DETECTRON2_DATASETS`. Under this directory, detectron2 will look for datasets in the structure described below, if needed. 6 | 7 | ```text 8 | $DETECTRON2_DATASETS/ 9 | ADEChallengeData2016/ 10 | cityscapes/ 11 | coco/ 12 | mapillary_vistas/ 13 | ``` 14 | 15 | - You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`. If left unset, the default is `./datasets` relative to your current working directory. 16 | 17 | 18 | ## Expected dataset structure for [ADE20K](http://sceneparsing.csail.mit.edu/) 19 | 20 | ```text 21 | ADEChallengeData2016/ 22 | images/ 23 | annotations/ 24 | objectInfo150.txt 25 | # download instance annotation 26 | annotations_instance/ 27 | # generated by prepare_ade20k_sem_seg.py 28 | annotations_detectron2/ 29 | # below are generated by prepare_ade20k_pan_seg.py 30 | ade20k_panoptic_{train,val}.json 31 | ade20k_panoptic_{train,val}/ 32 | # below are generated by prepare_ade20k_ins_seg.py 33 | ade20k_instance_{train,val}.json 34 | ``` 35 | 36 | - Generate `annotations_detectron2`: 37 | 38 | ```bash 39 | python datasets/prepare_ade20k_sem_seg.py 40 | ``` 41 | 42 | - Install panopticapi by: 43 | 44 | ```bash 45 | pip install git+https://github.com/cocodataset/panopticapi.git 46 | ``` 47 | 48 | - Download the instance annotation from : 49 | 50 | ```bash 51 | wget http://sceneparsing.csail.mit.edu/data/ChallengeData2017/annotations_instance.tar 52 | ``` 53 | 54 | - Then, run `python datasets/prepare_ade20k_pan_seg.py`, to combine semantic and instance annotations for panoptic annotations. 55 | 56 | - Run `python datasets/prepare_ade20k_ins_seg.py`, to extract instance annotations in COCO format. 57 | 58 | ## Expected dataset structure for [Cityscapes](https://www.cityscapes-dataset.com/downloads/) 59 | 60 | ```text 61 | cityscapes/ 62 | gtFine/ 63 | train/ 64 | aachen/ 65 | color.png, instanceIds.png, labelIds.png, polygons.json, 66 | labelTrainIds.png 67 | ... 
68 | val/ 69 | test/ 70 | # below are the generated Cityscapes panoptic annotations 71 | cityscapes_panoptic_train.json 72 | cityscapes_panoptic_train/ 73 | cityscapes_panoptic_val.json 74 | cityscapes_panoptic_val/ 75 | cityscapes_panoptic_test.json 76 | cityscapes_panoptic_test/ 77 | leftImg8bit/ 78 | train/ 79 | val/ 80 | test/ 81 | ``` 82 | 83 | - Log in and download the dataset: 84 | 85 | ```bash 86 | wget --keep-session-cookies --save-cookies=cookies.txt --post-data 'username=myusername&password=mypassword&submit=Login' https://www.cityscapes-dataset.com/login/ 87 | ######## gtFine 88 | wget --load-cookies cookies.txt --content-disposition https://www.cityscapes-dataset.com/file-handling/?packageID=1 89 | ######## leftImg8bit 90 | wget --load-cookies cookies.txt --content-disposition https://www.cityscapes-dataset.com/file-handling/?packageID=3 91 | ``` 92 | 93 | - Install the cityscapesScripts package by: 94 | 95 | ```bash 96 | pip install git+https://github.com/mcordts/cityscapesScripts.git 97 | ``` 98 | 99 | - To create labelTrainIds.png, first prepare the above structure, then run the cityscapesScripts preparation script with: 100 | 101 | ```bash 102 | git clone https://github.com/mcordts/cityscapesScripts.git 103 | ``` 104 | 105 | ```bash 106 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesScripts/cityscapesscripts/preparation/createTrainIdLabelImgs.py 107 | ``` 108 | 109 | These files are not needed for instance segmentation. 110 | 111 | - To generate the Cityscapes panoptic dataset, run the cityscapesScripts preparation script with: 112 | 113 | ```bash 114 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesScripts/cityscapesscripts/preparation/createPanopticImgs.py 115 | ``` 116 | 117 | These files are not needed for semantic and instance segmentation. 118 | 119 | ## Expected dataset structure for [COCO](https://cocodataset.org/#download) 120 | 121 | ```text 122 | coco/ 123 | annotations/ 124 | instances_{train,val}2017.json 125 | panoptic_{train,val}2017.json 126 | caption_{train,val}2017.json 127 | # evaluate on instance labels derived from panoptic annotations 128 | panoptic2instances_val2017.json 129 | {train,val}2017/ 130 | # image files that are mentioned in the corresponding json 131 | panoptic_{train,val}2017/ # png annotations 132 | panoptic_semseg_{train,val}2017/ # generated by the script mentioned below 133 | ``` 134 | 135 | - Install panopticapi by: 136 | 137 | ```bash 138 | pip install git+https://github.com/cocodataset/panopticapi.git 139 | ``` 140 | 141 | - Then run `python datasets/prepare_coco_semantic_annos_from_panoptic_annos.py` to extract semantic annotations from panoptic annotations (only used for evaluation). 142 | 143 | - Then run the following command to convert the panoptic json into instance json format (used for evaluation on the instance segmentation task): 144 | 145 | ```bash 146 | python datasets/panoptic2detection_coco_format.py --things_only 147 | ``` 148 | 149 | ## Expected dataset structure for [Mapillary Vistas](https://www.mapillary.com/dataset/vistas) 150 | 151 | ```text 152 | mapillary_vistas/ 153 | training/ 154 | images/ 155 | instances/ 156 | labels/ 157 | panoptic/ 158 | validation/ 159 | images/ 160 | instances/ 161 | labels/ 162 | panoptic/ 163 | mapillary_vistas_instance_{train,val}.json # generated by the script mentioned below 164 | ``` 165 | 166 | No preprocessing is needed for Mapillary Vistas on semantic and panoptic segmentation. 167 | 168 | We do not evaluate for the instance segmentation task on the Mapillary Vistas dataset.
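Once `DETECTRON2_DATASETS` points at the structure above and the preparation scripts have been run, a quick way to confirm that the splits referenced in the configs (e.g. `ade20k_panoptic_val`, `cityscapes_fine_panoptic_val`) are visible is to query Detectron2's catalogs. The sketch below is illustrative and not part of the repository: it assumes that importing the `oneformer` package registers the custom splits as a side effect, and the dataset path is a placeholder.

```python
# Hedged sanity check (not part of the repository).
# Assumption: importing `oneformer` registers the ADE20K/Cityscapes/COCO/Mapillary
# splits referenced by the configs (e.g. "ade20k_panoptic_val").
import os

# Point detectron2 at the dataset root before the registrations run.
os.environ.setdefault("DETECTRON2_DATASETS", "/path/to/datasets")  # placeholder path

from detectron2.data import DatasetCatalog, MetadataCatalog
import oneformer  # noqa: F401  (side effect: dataset registration, assumed)

# List every registered split that belongs to the datasets prepared above.
for name in sorted(DatasetCatalog.list()):
    if any(key in name for key in ("ade20k", "cityscapes", "coco", "mapillary")):
        print(name)

# Per-split metadata (class names, ignore label, ...) is attached once registered.
print(MetadataCatalog.get("ade20k_panoptic_val"))
```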
169 | -------------------------------------------------------------------------------- /datasets/ade20k_instance_catid_mapping.txt: -------------------------------------------------------------------------------- 1 | Instacne100 SceneParse150 FullADE20K 2 | 1 8 165 3 | 2 9 3055 4 | 3 11 350 5 | 4 13 1831 6 | 5 15 774 7 | 5 15 783 8 | 6 16 2684 9 | 7 19 687 10 | 8 20 471 11 | 9 21 401 12 | 10 23 1735 13 | 11 24 2473 14 | 12 25 2329 15 | 13 28 1564 16 | 14 31 57 17 | 15 32 2272 18 | 16 33 907 19 | 17 34 724 20 | 18 36 2985 21 | 18 36 533 22 | 19 37 1395 23 | 20 38 155 24 | 21 39 2053 25 | 22 40 689 26 | 23 42 266 27 | 24 43 581 28 | 25 44 2380 29 | 26 45 491 30 | 27 46 627 31 | 28 48 2388 32 | 29 50 943 33 | 30 51 2096 34 | 31 54 2530 35 | 32 56 420 36 | 33 57 1948 37 | 34 58 1869 38 | 35 59 2251 39 | 36 63 239 40 | 37 65 571 41 | 38 66 2793 42 | 39 67 978 43 | 40 68 236 44 | 41 70 181 45 | 42 71 629 46 | 43 72 2598 47 | 44 73 1744 48 | 45 74 1374 49 | 46 75 591 50 | 47 76 2679 51 | 48 77 223 52 | 49 79 47 53 | 50 81 327 54 | 51 82 2821 55 | 52 83 1451 56 | 53 84 2880 57 | 54 86 480 58 | 55 87 77 59 | 56 88 2616 60 | 57 89 246 61 | 57 89 247 62 | 58 90 2733 63 | 59 91 14 64 | 60 93 38 65 | 61 94 1936 66 | 62 96 120 67 | 63 98 1702 68 | 64 99 249 69 | 65 103 2928 70 | 66 104 2337 71 | 67 105 1023 72 | 68 108 2989 73 | 69 109 1930 74 | 70 111 2586 75 | 71 112 131 76 | 72 113 146 77 | 73 116 95 78 | 74 117 1563 79 | 75 119 1708 80 | 76 120 103 81 | 77 121 1002 82 | 78 122 2569 83 | 79 124 2833 84 | 80 125 1551 85 | 81 126 1981 86 | 82 127 29 87 | 83 128 187 88 | 84 130 747 89 | 85 131 2254 90 | 86 133 2262 91 | 87 134 1260 92 | 88 135 2243 93 | 89 136 2932 94 | 90 137 2836 95 | 91 138 2850 96 | 92 139 64 97 | 93 140 894 98 | 94 143 1919 99 | 95 144 1583 100 | 96 145 318 101 | 97 147 2046 102 | 98 148 1098 103 | 99 149 530 104 | 100 150 954 105 | -------------------------------------------------------------------------------- /datasets/custom_datasets/README.md: -------------------------------------------------------------------------------- 1 | # Training OneFormer with Custom Datasets 2 | 3 | OneFormer advocates the usage of panoptic annotations along with its task-conditioned joint training strategy. However, if panoptic annotations are not available, then also OneFormer can be trained using only the instance or semantic annotations on custom datasets. We provide some guidelines for training with custom datasets. 4 | 5 | ## Register your New Dataset 6 | 7 | - OneFormer uses the information (class names, thing classes, etc.) stored in a dataset's metadata while preparing a dataset dictionary using a [`dataset_mapper`](https://github.com/SHI-Labs/OneFormer/tree/main/oneformer/data/dataset_mappers). 8 | 9 | - [Use Custom Datasets](https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html) gives a deeper dive into registering a new custom dataset. 10 | 11 | ## Training with Available Panoptic Annotations 12 | 13 | - To prepare the dataset dictionary for each iteration during training, OneFormer uses a [`dataset_mapper`](https://github.com/SHI-Labs/OneFormer/tree/main/oneformer/data/dataset_mappers) class. 
14 | 15 | - Out of the box, we provide two `dataset_mapper` classes that support task-conditioned joint training using panoptic annotations: 16 | - [`COCOUnifiedNewBaselineDatasetMapper`](https://github.com/SHI-Labs/OneFormer/blob/5e04c9aaffd9bc73020d2238757f62346fe778c0/oneformer/data/dataset_mappers/coco_unified_new_baseline_dataset_mapper.py#L56): Specifically designed for the COCO annotation format. 17 | - [`OneFormerUnifiedDatasetMapper`](https://github.com/SHI-Labs/OneFormer/blob/5e04c9aaffd9bc73020d2238757f62346fe778c0/oneformer/data/dataset_mappers/oneformer_unified_dataset_mapper.py#L26): General annotation format. 18 | 19 | - If you have panoptic annotations for your custom dataset, you may use these dataset_mapper classes directly after registering your dataset. You may also tune the [task sampling probabilities in the corresponding config file](https://github.com/SHI-Labs/OneFormer/blob/5e04c9aaffd9bc73020d2238757f62346fe778c0/configs/ade20k/Base-ADE20K-UnifiedSegmentation.yaml#L55). 20 | 21 | - If you want to train using only instance or semantic annotations, please follow the next section on preparing a custom dataset mapper class. 22 | 23 | ## Write a Custom Dataset Mapper Class 24 | 25 | - If you want to train using only instance or semantic annotations, write your custom dataset mapper class and add it to the [`build_train_loader`](https://github.com/SHI-Labs/OneFormer/blob/5e04c9aaffd9bc73020d2238757f62346fe778c0/train_net.py#L156) method. 26 | 27 | - We provide a few templates for custom dataset mappers: 28 | - [`InstanceCOCOCustomNewBaselineDatasetMapper`](https://github.com/SHI-Labs/OneFormer/blob/a7fae86ce5791a93132c059c1bdfc79c9f842820/datasets/custom_datasets/instance_coco_custom_dataset_mapper.py#L72): Specifically designed for the COCO instance annotation format. 29 | - [`InstanceOneFormerCustomDatasetMapper`](https://github.com/SHI-Labs/OneFormer/blob/a7fae86ce5791a93132c059c1bdfc79c9f842820/datasets/custom_datasets/instance_oneformer_custom_dataset_mapper.py#L26): General instance annotation format. 30 | - [`SemanticOneFormerCustomDatasetMapper`](https://github.com/SHI-Labs/OneFormer/blob/a7fae86ce5791a93132c059c1bdfc79c9f842820/datasets/custom_datasets/semantic_oneformer_custom_dataset_mapper.py#L26): General semantic annotation format. 31 | 32 | - Remember to register your custom dataset before training; a minimal registration and loader sketch follows below. 33 | 34 | 35 | Now you are all set to train OneFormer using your custom dataset!
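
To make the two steps above concrete, here is a minimal sketch of registering a custom split and routing one of the template mappers through `build_train_loader`. The dataset name (`my_dataset_train`), class names, and the hard-coded record are hypothetical placeholders, and the import path for the template mapper depends on where you place the file; the detectron2 calls (`DatasetCatalog.register`, `MetadataCatalog.get(...).set`, `build_detection_train_loader`) follow their standard usage.

```python
from detectron2.data import DatasetCatalog, MetadataCatalog, build_detection_train_loader

# Template mapper listed above; adjust the import to match where you copy the file,
# since datasets/custom_datasets/ is not an installed package.
from instance_coco_custom_dataset_mapper import InstanceCOCOCustomNewBaselineDatasetMapper


def load_my_dataset_dicts():
    # Return a list[dict] in detectron2's standard dataset format;
    # a single hard-coded record is shown purely for illustration.
    return [
        {
            "file_name": "images/0001.jpg",
            "image_id": 0,
            "height": 480,
            "width": 640,
            "annotations": [],  # fill with your instance (or semantic) annotations
        }
    ]


# 1. Register the custom split and its metadata before training starts.
DatasetCatalog.register("my_dataset_train", load_my_dataset_dicts)
MetadataCatalog.get("my_dataset_train").set(
    thing_classes=["cat", "dog"],  # hypothetical class names
    ignore_label=255,
)


# 2. Route the custom mapper through the trainer, e.g. inside
#    Trainer.build_train_loader in train_net.py.
def build_train_loader(cfg):
    mapper = InstanceCOCOCustomNewBaselineDatasetMapper(cfg, True)
    return build_detection_train_loader(cfg, mapper=mapper)
```

Point `DATASETS.TRAIN` in your config at the registered name (`("my_dataset_train",)` here) so the loader picks up the new split.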
-------------------------------------------------------------------------------- /datasets/fg_ids.py: -------------------------------------------------------------------------------- 1 | ADE20K_FG_IDS = { 2 | 1: 8, 3 | 2: 9, 4 | 3: 11, 5 | 4: 13, 6 | 5: 15, 7 | 5: 15, 8 | 6: 16, 9 | 7: 19, 10 | 8: 20, 11 | 9: 21, 12 | 10: 23, 13 | 11: 24, 14 | 12: 25, 15 | 13: 28, 16 | 14: 31, 17 | 15: 32, 18 | 16: 33, 19 | 17: 34, 20 | 18: 36, 21 | 18: 36, 22 | 19: 37, 23 | 20: 38, 24 | 21: 39, 25 | 22: 40, 26 | 23: 42, 27 | 24: 43, 28 | 25: 44, 29 | 26: 45, 30 | 27: 46, 31 | 28: 48, 32 | 29: 50, 33 | 30: 51, 34 | 31: 54, 35 | 32: 56, 36 | 33: 57, 37 | 34: 58, 38 | 35: 59, 39 | 36: 63, 40 | 37: 65, 41 | 38: 66, 42 | 39: 67, 43 | 40: 68, 44 | 41: 70, 45 | 42: 71, 46 | 43: 72, 47 | 44: 73, 48 | 45: 74, 49 | 46: 75, 50 | 47: 76, 51 | 48: 77, 52 | 49: 79, 53 | 50: 81, 54 | 51: 82, 55 | 52: 83, 56 | 53: 84, 57 | 54: 86, 58 | 55: 87, 59 | 56: 88, 60 | 57: 89, 61 | 57: 89, 62 | 58: 90, 63 | 59: 91, 64 | 60: 93, 65 | 61: 94, 66 | 62: 96, 67 | 63: 98, 68 | 64: 99, 69 | 65: 103, 70 | 66: 104, 71 | 67: 105, 72 | 68: 108, 73 | 69: 109, 74 | 70: 111, 75 | 71: 112, 76 | 72: 113, 77 | 73: 116, 78 | 74: 117, 79 | 75: 119, 80 | 76: 120, 81 | 77: 121, 82 | 78: 122, 83 | 79: 124, 84 | 80: 125, 85 | 81: 126, 86 | 82: 127, 87 | 83: 128, 88 | 84: 130, 89 | 85: 131, 90 | 86: 133, 91 | 87: 134, 92 | 88: 135, 93 | 89: 136, 94 | 90: 137, 95 | 91: 138, 96 | 92: 139, 97 | 93: 140, 98 | 94: 143, 99 | 95: 144, 100 | 96: 145, 101 | 97: 147, 102 | 98: 148, 103 | 99: 149, 104 | 100: 150 105 | } 106 | 107 | 108 | CITYSCAPES_FG_NAMES = ['person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', 'bicycle'] -------------------------------------------------------------------------------- /datasets/panoptic2detection_coco_format.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # ------------------------------------------------------------------------------ 3 | # Reference: https://github.com/cocodataset/panopticapi/blob/master/converters/panoptic2detection_coco_format.py 4 | # Modified by Jitesh Jain (https://github.com/praeclarumjj3) 5 | # ------------------------------------------------------------------------------ 6 | ''' 7 | This script converts panoptic COCO format to detection COCO format. More 8 | information about the formats can be found here: 9 | http://cocodataset.org/#format-data. All segments will be stored in RLE format. 10 | 11 | Additional option: 12 | - using option '--things_only' the script can discard all stuff 13 | segments, saving segments of things classes only. 
14 | ''' 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | from __future__ import unicode_literals 19 | import os, sys 20 | import argparse 21 | import numpy as np 22 | import json 23 | import time 24 | import multiprocessing 25 | 26 | import PIL.Image as Image 27 | 28 | from panopticapi.utils import get_traceback, rgb2id, save_json 29 | 30 | try: 31 | # set up path for pycocotools 32 | # sys.path.append('./cocoapi-master/PythonAPI/') 33 | from pycocotools import mask as COCOmask 34 | except Exception: 35 | raise Exception("Please install pycocotools module from https://github.com/cocodataset/cocoapi") 36 | 37 | @get_traceback 38 | def convert_panoptic_to_detection_coco_format_single_core( 39 | proc_id, annotations_set, categories, segmentations_folder, things_only 40 | ): 41 | annotations_detection = [] 42 | for working_idx, annotation in enumerate(annotations_set): 43 | if working_idx % 100 == 0: 44 | print('Core: {}, {} from {} images processed'.format(proc_id, 45 | working_idx, 46 | len(annotations_set))) 47 | 48 | file_name = '{}.png'.format(annotation['file_name'].rsplit('.')[0]) 49 | try: 50 | pan_format = np.array( 51 | Image.open(os.path.join(segmentations_folder, file_name)), dtype=np.uint32 52 | ) 53 | except IOError: 54 | raise KeyError('no prediction png file for id: {}'.format(annotation['image_id'])) 55 | pan = rgb2id(pan_format) 56 | 57 | for segm_info in annotation['segments_info']: 58 | if things_only and categories[segm_info['category_id']]['isthing'] != 1: 59 | continue 60 | mask = (pan == segm_info['id']).astype(np.uint8) 61 | mask = np.expand_dims(mask, axis=2) 62 | segm_info.pop('id') 63 | segm_info['image_id'] = annotation['image_id'] 64 | rle = COCOmask.encode(np.asfortranarray(mask))[0] 65 | rle['counts'] = rle['counts'].decode('utf8') 66 | segm_info['segmentation'] = rle 67 | annotations_detection.append(segm_info) 68 | 69 | print('Core: {}, all {} images processed'.format(proc_id, len(annotations_set))) 70 | return annotations_detection 71 | 72 | 73 | def convert_panoptic_to_detection_coco_format(input_json_file, 74 | segmentations_folder, 75 | output_json_file, 76 | categories_json_file, 77 | things_only): 78 | start_time = time.time() 79 | 80 | if segmentations_folder is None: 81 | segmentations_folder = input_json_file.rsplit('.', 1)[0] 82 | 83 | print("CONVERTING...") 84 | print("COCO panoptic format:") 85 | print("\tSegmentation folder: {}".format(segmentations_folder)) 86 | print("\tJSON file: {}".format(input_json_file)) 87 | print("TO") 88 | print("COCO detection format") 89 | print("\tJSON file: {}".format(output_json_file)) 90 | if things_only: 91 | print("Saving only segments of things classes.") 92 | print('\n') 93 | 94 | print("Reading annotation information from {}".format(input_json_file)) 95 | with open(input_json_file, 'r') as f: 96 | d_coco = json.load(f) 97 | annotations_panoptic = d_coco['annotations'] 98 | 99 | with open(categories_json_file, 'r') as f: 100 | categories_list = json.load(f) 101 | categories = {category['id']: category for category in categories_list} 102 | 103 | cpu_num = multiprocessing.cpu_count() 104 | annotations_split = np.array_split(annotations_panoptic, cpu_num) 105 | print("Number of cores: {}, images per core: {}".format(cpu_num, len(annotations_split[0]))) 106 | workers = multiprocessing.Pool(processes=cpu_num) 107 | processes = [] 108 | for proc_id, annotations_set in enumerate(annotations_split): 109 | p = 
workers.apply_async(convert_panoptic_to_detection_coco_format_single_core, 110 | (proc_id, annotations_set, categories, segmentations_folder, things_only)) 111 | processes.append(p) 112 | annotations_coco_detection = [] 113 | for p in processes: 114 | annotations_coco_detection.extend(p.get()) 115 | for idx, ann in enumerate(annotations_coco_detection): 116 | ann['id'] = idx 117 | 118 | d_coco['annotations'] = annotations_coco_detection 119 | categories_coco_detection = [] 120 | for category in d_coco['categories']: 121 | if things_only and category['isthing'] != 1: 122 | continue 123 | category.pop('isthing') 124 | categories_coco_detection.append(category) 125 | d_coco['categories'] = categories_coco_detection 126 | save_json(d_coco, output_json_file) 127 | 128 | t_delta = time.time() - start_time 129 | print("Time elapsed: {:0.2f} seconds".format(t_delta)) 130 | 131 | 132 | if __name__ == "__main__": 133 | parser = argparse.ArgumentParser( 134 | description="The script converts panoptic COCO format to detection \ 135 | COCO format. See this file's head for more information." 136 | ) 137 | parser.add_argument('--things_only', action='store_true', 138 | help="discard stuff classes") 139 | args = parser.parse_args() 140 | 141 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 142 | root = os.path.join(_root, "coco") 143 | input_json_file = os.path.join(root, "annotations", "panoptic_val2017.json") 144 | output_json_file = os.path.join(root, "annotations", "panoptic2instances_val2017.json") 145 | categories_json_file = "datasets/panoptic_coco_categories.json" 146 | segmentations_folder = os.path.join(root, "panoptic_val2017") 147 | 148 | convert_panoptic_to_detection_coco_format(input_json_file, 149 | segmentations_folder, 150 | output_json_file, 151 | categories_json_file, 152 | args.things_only) 153 | -------------------------------------------------------------------------------- /datasets/prepare_ade20k_ins_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | import glob 5 | import json 6 | import os 7 | from collections import Counter 8 | 9 | import numpy as np 10 | import tqdm 11 | from panopticapi.utils import IdGenerator, save_json 12 | from PIL import Image 13 | import pycocotools.mask as mask_util 14 | 15 | 16 | if __name__ == "__main__": 17 | dataset_dir = os.getenv("DETECTRON2_DATASETS", "datasets") 18 | 19 | for name, dirname in [("train", "training"), ("val", "validation")]: 20 | image_dir = os.path.join(dataset_dir, f"ADEChallengeData2016/images/{dirname}/") 21 | instance_dir = os.path.join( 22 | dataset_dir, f"ADEChallengeData2016/annotations_instance/{dirname}/" 23 | ) 24 | 25 | # img_id = 0 26 | ann_id = 1 27 | 28 | # json 29 | out_file = os.path.join(dataset_dir, f"ADEChallengeData2016/ade20k_instance_{name}.json") 30 | 31 | # json config 32 | instance_config_file = "datasets/ade20k_instance_imgCatIds.json" 33 | with open(instance_config_file) as f: 34 | category_dict = json.load(f)["categories"] 35 | 36 | # load catid mapping 37 | # it is important to share category id for both instance and panoptic annotations 38 | mapping_file = "datasets/ade20k_instance_catid_mapping.txt" 39 | with open(mapping_file) as f: 40 | map_id = {} 41 | for i, line in enumerate(f.readlines()): 42 | if i == 0: 43 | continue 44 | ins_id, sem_id, _ = line.strip().split() 45 | # shift id by 1 because we want it to start from 0! 
46 | # ignore_label becomes 255 47 | map_id[int(ins_id)] = int(sem_id) - 1 48 | 49 | for cat in category_dict: 50 | cat["id"] = map_id[cat["id"]] 51 | 52 | filenames = sorted(glob.glob(os.path.join(image_dir, "*.jpg"))) 53 | 54 | ann_dict = {} 55 | images = [] 56 | annotations = [] 57 | 58 | for idx, filename in enumerate(tqdm.tqdm(filenames)): 59 | image = {} 60 | image_id = os.path.basename(filename).split(".")[0] 61 | 62 | image["id"] = image_id 63 | image["file_name"] = os.path.basename(filename) 64 | 65 | original_format = np.array(Image.open(filename)) 66 | image["width"] = original_format.shape[1] 67 | image["height"] = original_format.shape[0] 68 | 69 | images.append(image) 70 | 71 | filename_instance = os.path.join(instance_dir, image_id + ".png") 72 | ins_seg = np.asarray(Image.open(filename_instance)) 73 | assert ins_seg.dtype == np.uint8 74 | 75 | instance_cat_ids = ins_seg[..., 0] 76 | # instance id starts from 1! 77 | # because 0 is reserved as VOID label 78 | instance_ins_ids = ins_seg[..., 1] 79 | 80 | # process things 81 | for thing_id in np.unique(instance_ins_ids): 82 | if thing_id == 0: 83 | continue 84 | mask = instance_ins_ids == thing_id 85 | instance_cat_id = np.unique(instance_cat_ids[mask]) 86 | assert len(instance_cat_id) == 1 87 | 88 | anno = {} 89 | anno['id'] = ann_id 90 | ann_id += 1 91 | anno['image_id'] = image['id'] 92 | anno["iscrowd"] = int(0) 93 | anno["category_id"] = int(map_id[instance_cat_id[0]]) 94 | 95 | inds = np.nonzero(mask) 96 | ymin, ymax = inds[0].min(), inds[0].max() 97 | xmin, xmax = inds[1].min(), inds[1].max() 98 | anno["bbox"] = [int(xmin), int(ymin), int(xmax - xmin + 1), int(ymax - ymin + 1)] 99 | # if xmax <= xmin or ymax <= ymin: 100 | # continue 101 | rle = mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0] 102 | rle["counts"] = rle["counts"].decode("utf-8") 103 | anno["segmentation"] = rle 104 | anno["area"] = int(mask_util.area(rle)) 105 | annotations.append(anno) 106 | 107 | # save this 108 | ann_dict['images'] = images 109 | ann_dict['categories'] = category_dict 110 | ann_dict['annotations'] = annotations 111 | 112 | save_json(ann_dict, out_file) 113 | -------------------------------------------------------------------------------- /datasets/prepare_ade20k_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | import os 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import tqdm 9 | from PIL import Image 10 | 11 | 12 | def convert(input, output): 13 | img = np.asarray(Image.open(input)) 14 | assert img.dtype == np.uint8 15 | img = img - 1 # 0 (ignore) becomes 255. 
others are shifted by 1 16 | Image.fromarray(img).save(output) 17 | 18 | 19 | if __name__ == "__main__": 20 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016" 21 | for name in ["training", "validation"]: 22 | annotation_dir = dataset_dir / "annotations" / name 23 | output_dir = dataset_dir / "annotations_detectron2" / name 24 | output_dir.mkdir(parents=True, exist_ok=True) 25 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 26 | output_file = output_dir / file.name 27 | convert(file, output_file) 28 | -------------------------------------------------------------------------------- /datasets/prepare_coco_semantic_annos_from_panoptic_annos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | 5 | import functools 6 | import json 7 | import multiprocessing as mp 8 | import numpy as np 9 | import os 10 | import time 11 | from fvcore.common.download import download 12 | from panopticapi.utils import rgb2id 13 | from PIL import Image 14 | 15 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES 16 | 17 | 18 | def _process_panoptic_to_semantic(input_panoptic, output_semantic, segments, id_map): 19 | panoptic = np.asarray(Image.open(input_panoptic), dtype=np.uint32) 20 | panoptic = rgb2id(panoptic) 21 | output = np.zeros_like(panoptic, dtype=np.uint8) + 255 22 | for seg in segments: 23 | cat_id = seg["category_id"] 24 | new_cat_id = id_map[cat_id] 25 | output[panoptic == seg["id"]] = new_cat_id 26 | Image.fromarray(output).save(output_semantic) 27 | 28 | 29 | def separate_coco_semantic_from_panoptic(panoptic_json, panoptic_root, sem_seg_root, categories): 30 | """ 31 | Create semantic segmentation annotations from panoptic segmentation 32 | annotations, to be used by PanopticFPN. 33 | It maps all thing categories to class 0, and maps all unlabeled pixels to class 255. 34 | It maps all stuff categories to contiguous ids starting from 1. 35 | Args: 36 | panoptic_json (str): path to the panoptic json file, in COCO's format. 37 | panoptic_root (str): a directory with panoptic annotation files, in COCO's format. 38 | sem_seg_root (str): a directory to output semantic annotation files 39 | categories (list[dict]): category metadata. Each dict needs to have: 40 | "id": corresponds to the "category_id" in the json annotations 41 | "isthing": 0 or 1 42 | """ 43 | os.makedirs(sem_seg_root, exist_ok=True) 44 | 45 | id_map = {} # map from category id to id in the output semantic annotation 46 | assert len(categories) <= 254 47 | for i, k in enumerate(categories): 48 | id_map[k["id"]] = i 49 | # what is id = 0? 50 | # id_map[0] = 255 51 | print(id_map) 52 | 53 | with open(panoptic_json) as f: 54 | obj = json.load(f) 55 | 56 | pool = mp.Pool(processes=max(mp.cpu_count() // 2, 4)) 57 | 58 | def iter_annotations(): 59 | for anno in obj["annotations"]: 60 | file_name = anno["file_name"] 61 | segments = anno["segments_info"] 62 | input = os.path.join(panoptic_root, file_name) 63 | output = os.path.join(sem_seg_root, file_name) 64 | yield input, output, segments 65 | 66 | print("Start writing to {} ...".format(sem_seg_root)) 67 | start = time.time() 68 | pool.starmap( 69 | functools.partial(_process_panoptic_to_semantic, id_map=id_map), 70 | iter_annotations(), 71 | chunksize=100, 72 | ) 73 | print("Finished. 
time: {:.2f}s".format(time.time() - start)) 74 | 75 | 76 | if __name__ == "__main__": 77 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco") 78 | for s in ["val2017", "train2017"]: 79 | separate_coco_semantic_from_panoptic( 80 | os.path.join(dataset_dir, "annotations/panoptic_{}.json".format(s)), 81 | os.path.join(dataset_dir, "panoptic_{}".format(s)), 82 | os.path.join(dataset_dir, "panoptic_semseg_{}".format(s)), 83 | COCO_CATEGORIES, 84 | ) 85 | -------------------------------------------------------------------------------- /demo/README.md: -------------------------------------------------------------------------------- 1 | # OneFormer Demo 2 | 3 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/SHI-Labs/OneFormer/blob/main/colab/oneformer_colab.ipynb) [![Huggingface space](https://img.shields.io/badge/🤗-Huggingface%20Space-cyan.svg)](https://huggingface.co/spaces/shi-labs/OneFormer) 4 | 5 | - Pick a model and its config file from. For example, `configs/ade20k/swin/oneformer_swin_large_IN21k_384_bs16_160k.yaml`. 6 | - We provide `demo.py` that is able to demo builtin configs. 7 | - You need to specify the `task` token value during inference, The outputs will be saved accordingly in the specified `OUTPUT_DIR`: 8 | - `panoptic`: Panoptic, Semantic and Instance Predictions when the value of `task` token is `panoptic`. 9 | - `instance`: Instance Predictions when the value of `task` token is `instance`. 10 | - `semantic`: Semantic Predictions when the value of `task` token is `semantic`. 11 | - >Note: You can change the outputs to be saved on line 60 in [predictor.py](predictor.py). 12 | 13 | ```bash 14 | export task=panoptic 15 | 16 | python demo.py --config-file ../configs/ade20k/swin/oneformer_swin_large_bs16_160k.yaml \ 17 | --input \ 18 | --output \ 19 | --task $task \ 20 | --opts MODEL.IS_TRAIN False MODEL.IS_DEMO True MODEL.WEIGHTS 21 | ``` 22 | 23 | For details of the command line arguments, see `demo.py -h` or look at its source code 24 | to understand its behavior. -------------------------------------------------------------------------------- /demo/colormap.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/utils/colormap.py 3 | # Modified by Jitesh Jain (https://github.com/praeclarumjj3) 4 | # ------------------------------------------------------------------------------ 5 | 6 | """ 7 | An awesome colormap for really neat visualizations. 8 | Copied from Detectron, and removed gray colors. 9 | """ 10 | 11 | import numpy as np 12 | import random 13 | random.seed(0) 14 | 15 | __all__ = ["colormap", "random_color", "random_colors"] 16 | 17 | _COLORS = [] 18 | 19 | def gen_color(): 20 | color = tuple(np.round(np.random.choice(range(256), size=3)/255, 3)) 21 | if color not in _COLORS and np.mean(color) != 0.0: 22 | _COLORS.append(color) 23 | else: 24 | gen_color() 25 | 26 | 27 | for _ in range(300): 28 | gen_color() 29 | 30 | 31 | def colormap(rgb=False, maximum=255): 32 | """ 33 | Args: 34 | rgb (bool): whether to return RGB colors or BGR colors. 
35 | maximum (int): either 255 or 1 36 | Returns: 37 | ndarray: a float32 array of Nx3 colors, in range [0, 255] or [0, 1] 38 | """ 39 | assert maximum in [255, 1], maximum 40 | c = _COLORS * maximum 41 | if not rgb: 42 | c = c[:, ::-1] 43 | return c 44 | 45 | 46 | def random_color(rgb=False, maximum=255): 47 | """ 48 | Args: 49 | rgb (bool): whether to return RGB colors or BGR colors. 50 | maximum (int): either 255 or 1 51 | Returns: 52 | ndarray: a vector of 3 numbers 53 | """ 54 | idx = np.random.randint(0, len(_COLORS)) 55 | ret = _COLORS[idx] * maximum 56 | if not rgb: 57 | ret = ret[::-1] 58 | return ret 59 | 60 | 61 | def random_colors(N, rgb=False, maximum=255): 62 | """ 63 | Args: 64 | N (int): number of unique colors needed 65 | rgb (bool): whether to return RGB colors or BGR colors. 66 | maximum (int): either 255 or 1 67 | Returns: 68 | ndarray: a list of random_color 69 | """ 70 | indices = random.sample(range(len(_COLORS)), N) 71 | ret = [_COLORS[i] * maximum for i in indices] 72 | if not rgb: 73 | ret = [x[::-1] for x in ret] 74 | return ret 75 | 76 | 77 | if __name__ == "__main__": 78 | import cv2 79 | 80 | size = 100 81 | H, W = 10, 10 82 | canvas = np.random.rand(H * size, W * size, 3).astype("float32") 83 | for h in range(H): 84 | for w in range(W): 85 | idx = h * W + w 86 | if idx >= len(_COLORS): 87 | break 88 | canvas[h * size : (h + 1) * size, w * size : (w + 1) * size] = _COLORS[idx] 89 | cv2.imshow("a", canvas) 90 | cv2.waitKey(0) -------------------------------------------------------------------------------- /demo/defaults.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/defaults.py 3 | # Modified by Jitesh Jain (https://github.com/praeclarumjj3) 4 | # ------------------------------------------------------------------------------ 5 | 6 | import torch 7 | import detectron2.data.transforms as T 8 | from detectron2.checkpoint import DetectionCheckpointer 9 | from detectron2.data import ( 10 | MetadataCatalog, 11 | ) 12 | from detectron2.modeling import build_model 13 | 14 | 15 | __all__ = [ 16 | "DefaultPredictor", 17 | ] 18 | 19 | 20 | class DefaultPredictor: 21 | """ 22 | Create a simple end-to-end predictor with the given config that runs on 23 | single device for a single input image. 24 | Compared to using the model directly, this class does the following additions: 25 | 1. Load checkpoint from `cfg.MODEL.WEIGHTS`. 26 | 2. Always take BGR image as the input and apply conversion defined by `cfg.INPUT.FORMAT`. 27 | 3. Apply resizing defined by `cfg.INPUT.{MIN,MAX}_SIZE_TEST`. 28 | 4. Take one input image and produce a single output, instead of a batch. 29 | This is meant for simple demo purposes, so it does the above steps automatically. 30 | This is not meant for benchmarks or running complicated inference logic. 31 | If you'd like to do anything more complicated, please refer to its source code as 32 | examples to build and use the model manually. 33 | Attributes: 34 | metadata (Metadata): the metadata of the underlying dataset, obtained from 35 | cfg.DATASETS.TEST. 
36 | Examples: 37 | :: 38 | pred = DefaultPredictor(cfg) 39 | inputs = cv2.imread("input.jpg") 40 | outputs = pred(inputs) 41 | """ 42 | 43 | def __init__(self, cfg): 44 | self.cfg = cfg.clone() # cfg can be modified by model 45 | self.model = build_model(self.cfg) 46 | self.model.eval() 47 | if len(cfg.DATASETS.TEST): 48 | self.metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0]) 49 | 50 | checkpointer = DetectionCheckpointer(self.model) 51 | checkpointer.load(cfg.MODEL.WEIGHTS) 52 | 53 | self.aug = T.ResizeShortestEdge( 54 | [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST 55 | ) 56 | 57 | self.input_format = cfg.INPUT.FORMAT 58 | assert self.input_format in ["RGB", "BGR"], self.input_format 59 | 60 | def __call__(self, original_image, task): 61 | """ 62 | Args: 63 | original_image (np.ndarray): an image of shape (H, W, C) (in BGR order). 64 | Returns: 65 | predictions (dict): 66 | the output of the model for one image only. 67 | See :doc:`/tutorials/models` for details about the format. 68 | """ 69 | with torch.no_grad(): # https://github.com/sphinx-doc/sphinx/issues/4258 70 | # Apply pre-processing to image. 71 | if self.input_format == "RGB": 72 | # whether the model expects BGR inputs or RGB 73 | original_image = original_image[:, :, ::-1] 74 | height, width = original_image.shape[:2] 75 | image = self.aug.get_transform(original_image).apply_image(original_image) 76 | image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1)) 77 | 78 | task = f"The task is {task}" 79 | 80 | inputs = {"image": image, "height": height, "width": width, "task": task} 81 | predictions = self.model([inputs])[0] 82 | return predictions -------------------------------------------------------------------------------- /demo/demo.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/demo/demo.py 3 | # Modified by Jitesh Jain (https://github.com/praeclarumjj3) 4 | # ------------------------------------------------------------------------------ 5 | 6 | import argparse 7 | import multiprocessing as mp 8 | import os 9 | import torch 10 | import random 11 | # fmt: off 12 | import sys 13 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 14 | # fmt: on 15 | 16 | import time 17 | import cv2 18 | import numpy as np 19 | import tqdm 20 | 21 | from detectron2.config import get_cfg 22 | from detectron2.data.detection_utils import read_image 23 | from detectron2.projects.deeplab import add_deeplab_config 24 | from detectron2.utils.logger import setup_logger 25 | 26 | from oneformer import ( 27 | add_oneformer_config, 28 | add_common_config, 29 | add_swin_config, 30 | add_dinat_config, 31 | add_convnext_config, 32 | ) 33 | from predictor import VisualizationDemo 34 | 35 | # constants 36 | WINDOW_NAME = "OneFormer Demo" 37 | 38 | def setup_cfg(args): 39 | # load config from file and command-line arguments 40 | cfg = get_cfg() 41 | add_deeplab_config(cfg) 42 | add_common_config(cfg) 43 | add_swin_config(cfg) 44 | add_dinat_config(cfg) 45 | add_convnext_config(cfg) 46 | add_oneformer_config(cfg) 47 | cfg.merge_from_file(args.config_file) 48 | cfg.merge_from_list(args.opts) 49 | cfg.freeze() 50 | return cfg 51 | 52 | 53 | def get_parser(): 54 | parser = argparse.ArgumentParser(description="oneformer demo for builtin configs") 55 | parser.add_argument( 56 | "--config-file", 57 | 
default="../configs/ade20k/swin/oneformer_swin_large_IN21k_384_bs16_160k.yaml", 58 | metavar="FILE", 59 | help="path to config file", 60 | ) 61 | parser.add_argument("--task", help="Task type") 62 | parser.add_argument( 63 | "--input", 64 | nargs="+", 65 | help="A list of space separated input images; " 66 | "or a single glob pattern such as 'directory/*.jpg'", 67 | ) 68 | parser.add_argument( 69 | "--output", 70 | help="A file or directory to save output visualizations. " 71 | "If not given, will show output in an OpenCV window.", 72 | ) 73 | 74 | parser.add_argument( 75 | "--confidence-threshold", 76 | type=float, 77 | default=0.5, 78 | help="Minimum score for instance predictions to be shown", 79 | ) 80 | parser.add_argument( 81 | "--opts", 82 | help="Modify config options using the command-line 'KEY VALUE' pairs", 83 | default=[], 84 | nargs=argparse.REMAINDER, 85 | ) 86 | return parser 87 | 88 | if __name__ == "__main__": 89 | seed = 0 90 | random.seed(seed) 91 | np.random.seed(seed) 92 | torch.manual_seed(seed) 93 | torch.cuda.manual_seed_all(seed) 94 | torch.backends.cudnn.deterministic = True 95 | torch.backends.cudnn.benchmark = False 96 | 97 | mp.set_start_method("spawn", force=True) 98 | args = get_parser().parse_args() 99 | setup_logger(name="fvcore") 100 | logger = setup_logger() 101 | logger.info("Arguments: " + str(args)) 102 | 103 | cfg = setup_cfg(args) 104 | 105 | demo = VisualizationDemo(cfg) 106 | 107 | if args.input: 108 | for path in tqdm.tqdm(args.input, disable=not args.output): 109 | # use PIL, to be consistent with evaluation 110 | 111 | img = read_image(path, format="BGR") 112 | start_time = time.time() 113 | predictions, visualized_output = demo.run_on_image(img, args.task) 114 | logger.info( 115 | "{}: {} in {:.2f}s".format( 116 | path, 117 | "detected {} instances".format(len(predictions["instances"])) 118 | if "instances" in predictions 119 | else "finished", 120 | time.time() - start_time, 121 | ) 122 | ) 123 | if args.output: 124 | if len(args.input) == 1: 125 | for k in visualized_output.keys(): 126 | os.makedirs(k, exist_ok=True) 127 | out_filename = os.path.join(k, args.output) 128 | visualized_output[k].save(out_filename) 129 | else: 130 | for k in visualized_output.keys(): 131 | opath = os.path.join(args.output, k) 132 | os.makedirs(opath, exist_ok=True) 133 | out_filename = os.path.join(opath, os.path.basename(path)) 134 | visualized_output[k].save(out_filename) 135 | else: 136 | raise ValueError("Please specify an output path!") 137 | else: 138 | raise ValueError("No Input Given") 139 | -------------------------------------------------------------------------------- /demo/predictor.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Reference: https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py 3 | # Modified by Jitesh Jain (https://github.com/praeclarumjj3) 4 | # ------------------------------------------------------------------------------ 5 | 6 | import atexit 7 | import bisect 8 | import multiprocessing as mp 9 | import torch 10 | 11 | from detectron2.data import MetadataCatalog 12 | from defaults import DefaultPredictor 13 | from visualizer import ColorMode, Visualizer 14 | 15 | 16 | class VisualizationDemo(object): 17 | def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False): 18 | """ 19 | Args: 20 | cfg (CfgNode): 21 | instance_mode (ColorMode): 22 | parallel (bool): whether to run 
the model in different processes from visualization. 23 | Useful since the visualization logic can be slow. 24 | """ 25 | self.metadata = MetadataCatalog.get( 26 | cfg.DATASETS.TEST_PANOPTIC[0] if len(cfg.DATASETS.TEST_PANOPTIC) else "__unused" 27 | ) 28 | if 'cityscapes_fine_sem_seg_val' in cfg.DATASETS.TEST_PANOPTIC[0]: 29 | from cityscapesscripts.helpers.labels import labels 30 | stuff_colors = [k.color for k in labels if k.trainId != 255] 31 | self.metadata = self.metadata.set(stuff_colors=stuff_colors) 32 | self.cpu_device = torch.device("cpu") 33 | self.instance_mode = instance_mode 34 | 35 | self.parallel = parallel 36 | if parallel: 37 | num_gpu = torch.cuda.device_count() 38 | self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu) 39 | else: 40 | self.predictor = DefaultPredictor(cfg) 41 | 42 | def run_on_image(self, image, task): 43 | """ 44 | Args: 45 | image (np.ndarray): an image of shape (H, W, C) (in BGR order). 46 | This is the format used by OpenCV. 47 | Returns: 48 | predictions (dict): the output of the model. 49 | vis_output (VisImage): the visualized image output. 50 | """ 51 | vis_output = None 52 | # Convert image from OpenCV BGR format to Matplotlib RGB format. 53 | image = image[:, :, ::-1] 54 | vis_output = {} 55 | 56 | if task == 'panoptic': 57 | visualizer = Visualizer(image, metadata=self.metadata, instance_mode=ColorMode.IMAGE) 58 | predictions = self.predictor(image, task) 59 | panoptic_seg, segments_info = predictions["panoptic_seg"] 60 | vis_output['panoptic_inference'] = visualizer.draw_panoptic_seg_predictions( 61 | panoptic_seg.to(self.cpu_device), segments_info, alpha=0.7 62 | ) 63 | 64 | if task == 'panoptic' or task == 'semantic': 65 | visualizer = Visualizer(image, metadata=self.metadata, instance_mode=ColorMode.IMAGE_BW) 66 | predictions = self.predictor(image, task) 67 | vis_output['semantic_inference'] = visualizer.draw_sem_seg( 68 | predictions["sem_seg"].argmax(dim=0).to(self.cpu_device), alpha=0.7 69 | ) 70 | 71 | if task == 'panoptic' or task == 'instance': 72 | visualizer = Visualizer(image, metadata=self.metadata, instance_mode=ColorMode.IMAGE_BW) 73 | predictions = self.predictor(image, task) 74 | instances = predictions["instances"].to(self.cpu_device) 75 | vis_output['instance_inference'] = visualizer.draw_instance_predictions(predictions=instances, alpha=1) 76 | 77 | return predictions, vis_output 78 | 79 | 80 | class AsyncPredictor: 81 | """ 82 | A predictor that runs the model asynchronously, possibly on >1 GPUs. 83 | Because rendering the visualization takes considerably amount of time, 84 | this helps improve throughput a little bit when rendering videos. 
85 | """ 86 | 87 | class _StopToken: 88 | pass 89 | 90 | class _PredictWorker(mp.Process): 91 | def __init__(self, cfg, task_queue, result_queue): 92 | self.cfg = cfg 93 | self.task_queue = task_queue 94 | self.result_queue = result_queue 95 | super().__init__() 96 | 97 | def run(self): 98 | predictor = DefaultPredictor(self.cfg) 99 | 100 | while True: 101 | task = self.task_queue.get() 102 | if isinstance(task, AsyncPredictor._StopToken): 103 | break 104 | idx, data = task 105 | result = predictor(data) 106 | self.result_queue.put((idx, result)) 107 | 108 | def __init__(self, cfg, num_gpus: int = 1): 109 | """ 110 | Args: 111 | cfg (CfgNode): 112 | num_gpus (int): if 0, will run on CPU 113 | """ 114 | num_workers = max(num_gpus, 1) 115 | self.task_queue = mp.Queue(maxsize=num_workers * 3) 116 | self.result_queue = mp.Queue(maxsize=num_workers * 3) 117 | self.procs = [] 118 | for gpuid in range(max(num_gpus, 1)): 119 | cfg = cfg.clone() 120 | cfg.defrost() 121 | cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu" 122 | self.procs.append( 123 | AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue) 124 | ) 125 | 126 | self.put_idx = 0 127 | self.get_idx = 0 128 | self.result_rank = [] 129 | self.result_data = [] 130 | 131 | for p in self.procs: 132 | p.start() 133 | atexit.register(self.shutdown) 134 | 135 | def put(self, image): 136 | self.put_idx += 1 137 | self.task_queue.put((self.put_idx, image)) 138 | 139 | def get(self): 140 | self.get_idx += 1 # the index needed for this request 141 | if len(self.result_rank) and self.result_rank[0] == self.get_idx: 142 | res = self.result_data[0] 143 | del self.result_data[0], self.result_rank[0] 144 | return res 145 | 146 | while True: 147 | # make sure the results are returned in the correct order 148 | idx, res = self.result_queue.get() 149 | if idx == self.get_idx: 150 | return res 151 | insert = bisect.bisect(self.result_rank, idx) 152 | self.result_rank.insert(insert, idx) 153 | self.result_data.insert(insert, res) 154 | 155 | def __len__(self): 156 | return self.put_idx - self.get_idx 157 | 158 | def __call__(self, image): 159 | self.put(image) 160 | return self.get() 161 | 162 | def shutdown(self): 163 | for _ in self.procs: 164 | self.task_queue.put(AsyncPredictor._StopToken()) 165 | 166 | @property 167 | def default_buffer_size(self): 168 | return len(self.procs) * 5 169 | -------------------------------------------------------------------------------- /images/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SHI-Labs/OneFormer/4962ef6a96ffb76a76771bfa3e8b3587f209752b/images/teaser.png -------------------------------------------------------------------------------- /oneformer/__init__.py: -------------------------------------------------------------------------------- 1 | from . import data # register all new datasets 2 | from . 
import modeling 3 | 4 | # config 5 | from .config import * 6 | 7 | # dataset loading 8 | from .data.dataset_mappers.coco_unified_new_baseline_dataset_mapper import COCOUnifiedNewBaselineDatasetMapper 9 | from .data.dataset_mappers.oneformer_unified_dataset_mapper import ( 10 | OneFormerUnifiedDatasetMapper, 11 | ) 12 | 13 | # models 14 | from .oneformer_model import OneFormer 15 | from .test_time_augmentation import SemanticSegmentorWithTTA 16 | 17 | # evaluation 18 | from .evaluation.instance_evaluation import InstanceSegEvaluator 19 | -------------------------------------------------------------------------------- /oneformer/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | from detectron2.config import CfgNode as CN 4 | 5 | __all__ = ["add_common_config", "add_oneformer_config", "add_swin_config", 6 | "add_dinat_config", "add_convnext_config"] 7 | 8 | def add_common_config(cfg): 9 | """ 10 | Add config for common configuration 11 | """ 12 | 13 | # data config 14 | # select the dataset mapper 15 | cfg.INPUT.DATASET_MAPPER_NAME = "oneformer_unified" 16 | # Color augmentation 17 | cfg.INPUT.COLOR_AUG_SSD = False 18 | # We retry random cropping until no single category in semantic segmentation GT occupies more 19 | # than `SINGLE_CATEGORY_MAX_AREA` part of the crop. 20 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0 21 | # Pad image and segmentation GT in dataset mapper. 22 | cfg.INPUT.SIZE_DIVISIBILITY = -1 23 | 24 | cfg.INPUT.TASK_SEQ_LEN = 77 25 | cfg.INPUT.MAX_SEQ_LEN = 77 26 | 27 | cfg.INPUT.TASK_PROB = CN() 28 | cfg.INPUT.TASK_PROB.SEMANTIC = 0.33 29 | cfg.INPUT.TASK_PROB.INSTANCE = 0.66 30 | 31 | # test dataset 32 | cfg.DATASETS.TEST_PANOPTIC = ("",) 33 | cfg.DATASETS.TEST_INSTANCE = ("",) 34 | cfg.DATASETS.TEST_SEMANTIC = ("",) 35 | 36 | # solver config 37 | # weight decay on embedding 38 | cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0 39 | # optimizer 40 | cfg.SOLVER.OPTIMIZER = "ADAMW" 41 | cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1 42 | 43 | # wandb 44 | cfg.WANDB = CN() 45 | cfg.WANDB.PROJECT = "OneFormer" 46 | cfg.WANDB.NAME = None 47 | 48 | cfg.MODEL.IS_TRAIN = True 49 | cfg.MODEL.IS_DEMO = False 50 | 51 | # text encoder config 52 | cfg.MODEL.TEXT_ENCODER = CN() 53 | 54 | cfg.MODEL.TEXT_ENCODER.WIDTH = 256 55 | cfg.MODEL.TEXT_ENCODER.CONTEXT_LENGTH = 77 56 | cfg.MODEL.TEXT_ENCODER.NUM_LAYERS = 12 57 | cfg.MODEL.TEXT_ENCODER.VOCAB_SIZE = 49408 58 | cfg.MODEL.TEXT_ENCODER.PROJ_NUM_LAYERS = 2 59 | cfg.MODEL.TEXT_ENCODER.N_CTX = 16 60 | 61 | # oneformer inference config 62 | cfg.MODEL.TEST = CN() 63 | cfg.MODEL.TEST.SEMANTIC_ON = True 64 | cfg.MODEL.TEST.INSTANCE_ON = False 65 | cfg.MODEL.TEST.PANOPTIC_ON = False 66 | cfg.MODEL.TEST.DETECTION_ON = False 67 | cfg.MODEL.TEST.OBJECT_MASK_THRESHOLD = 0.0 68 | cfg.MODEL.TEST.OVERLAP_THRESHOLD = 0.0 69 | cfg.MODEL.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False 70 | cfg.MODEL.TEST.TASK = "panoptic" 71 | 72 | # TEST AUG Slide 73 | cfg.TEST.AUG.IS_SLIDE = False 74 | cfg.TEST.AUG.CROP_SIZE = (640, 640) 75 | cfg.TEST.AUG.STRIDE = (426, 426) 76 | cfg.TEST.AUG.SCALE = (2048, 640) 77 | cfg.TEST.AUG.SETR_MULTI_SCALE = True 78 | cfg.TEST.AUG.KEEP_RATIO = True 79 | cfg.TEST.AUG.SIZE_DIVISOR = 32 80 | 81 | # pixel decoder config 82 | cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256 83 | # adding transformer in pixel decoder 84 | cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0 85 | # pixel decoder 86 | 
cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder" 87 | cfg.MODEL.SEM_SEG_HEAD.SEM_EMBED_DIM = 256 88 | cfg.MODEL.SEM_SEG_HEAD.INST_EMBED_DIM = 256 89 | 90 | # LSJ aug 91 | cfg.INPUT.IMAGE_SIZE = 1024 92 | cfg.INPUT.MIN_SCALE = 0.1 93 | cfg.INPUT.MAX_SCALE = 2.0 94 | 95 | # MSDeformAttn encoder configs 96 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"] 97 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4 98 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8 99 | 100 | def add_oneformer_config(cfg): 101 | """ 102 | Add config for ONE_FORMER. 103 | """ 104 | 105 | # oneformer model config 106 | cfg.MODEL.ONE_FORMER = CN() 107 | 108 | # loss 109 | cfg.MODEL.ONE_FORMER.DEEP_SUPERVISION = True 110 | cfg.MODEL.ONE_FORMER.NO_OBJECT_WEIGHT = 0.1 111 | cfg.MODEL.ONE_FORMER.CLASS_WEIGHT = 1.0 112 | cfg.MODEL.ONE_FORMER.DICE_WEIGHT = 1.0 113 | cfg.MODEL.ONE_FORMER.MASK_WEIGHT = 20.0 114 | cfg.MODEL.ONE_FORMER.CONTRASTIVE_WEIGHT = 0.5 115 | cfg.MODEL.ONE_FORMER.CONTRASTIVE_TEMPERATURE = 0.07 116 | 117 | # transformer config 118 | cfg.MODEL.ONE_FORMER.NHEADS = 8 119 | cfg.MODEL.ONE_FORMER.DROPOUT = 0.1 120 | cfg.MODEL.ONE_FORMER.DIM_FEEDFORWARD = 2048 121 | cfg.MODEL.ONE_FORMER.ENC_LAYERS = 0 122 | cfg.MODEL.ONE_FORMER.CLASS_DEC_LAYERS = 2 123 | cfg.MODEL.ONE_FORMER.DEC_LAYERS = 6 124 | cfg.MODEL.ONE_FORMER.PRE_NORM = False 125 | 126 | cfg.MODEL.ONE_FORMER.HIDDEN_DIM = 256 127 | cfg.MODEL.ONE_FORMER.NUM_OBJECT_QUERIES = 120 128 | cfg.MODEL.ONE_FORMER.NUM_OBJECT_CTX = 16 129 | cfg.MODEL.ONE_FORMER.USE_TASK_NORM = True 130 | 131 | cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE = "res5" 132 | cfg.MODEL.ONE_FORMER.ENFORCE_INPUT_PROJ = False 133 | 134 | # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet) 135 | # you can use this config to override 136 | cfg.MODEL.ONE_FORMER.SIZE_DIVISIBILITY = 32 137 | 138 | # transformer module 139 | cfg.MODEL.ONE_FORMER.TRANSFORMER_DECODER_NAME = "ContrastiveMultiScaleMaskedTransformerDecoder" 140 | 141 | # point loss configs 142 | # Number of points sampled during training for a mask point head. 143 | cfg.MODEL.ONE_FORMER.TRAIN_NUM_POINTS = 112 * 112 144 | # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the 145 | # original paper. 146 | cfg.MODEL.ONE_FORMER.OVERSAMPLE_RATIO = 3.0 147 | # Importance sampling parameter for PointRend point sampling during training. Parametr `beta` in 148 | # the original paper. 149 | cfg.MODEL.ONE_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75 150 | 151 | def add_swin_config(cfg): 152 | """ 153 | Add config forSWIN Backbone. 154 | """ 155 | 156 | # swin transformer backbone 157 | cfg.MODEL.SWIN = CN() 158 | cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224 159 | cfg.MODEL.SWIN.PATCH_SIZE = 4 160 | cfg.MODEL.SWIN.EMBED_DIM = 96 161 | cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2] 162 | cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24] 163 | cfg.MODEL.SWIN.WINDOW_SIZE = 7 164 | cfg.MODEL.SWIN.MLP_RATIO = 4.0 165 | cfg.MODEL.SWIN.QKV_BIAS = True 166 | cfg.MODEL.SWIN.QK_SCALE = None 167 | cfg.MODEL.SWIN.DROP_RATE = 0.0 168 | cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0 169 | cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3 170 | cfg.MODEL.SWIN.APE = False 171 | cfg.MODEL.SWIN.PATCH_NORM = True 172 | cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"] 173 | cfg.MODEL.SWIN.USE_CHECKPOINT = False 174 | 175 | def add_dinat_config(cfg): 176 | """ 177 | Add config for NAT Backbone. 
178 | """ 179 | 180 | # DINAT transformer backbone 181 | cfg.MODEL.DiNAT = CN() 182 | cfg.MODEL.DiNAT.DEPTHS = [3, 4, 18, 5] 183 | cfg.MODEL.DiNAT.OUT_FEATURES = ["res2", "res3", "res4", "res5"] 184 | cfg.MODEL.DiNAT.EMBED_DIM = 64 185 | cfg.MODEL.DiNAT.MLP_RATIO = 3.0 186 | cfg.MODEL.DiNAT.NUM_HEADS = [2, 4, 8, 16] 187 | cfg.MODEL.DiNAT.DROP_PATH_RATE = 0.2 188 | cfg.MODEL.DiNAT.KERNEL_SIZE = 7 189 | cfg.MODEL.DiNAT.DILATIONS = [[1, 16, 1], [1, 4, 1, 8], [1, 2, 1, 3, 1, 4], [1, 2, 1, 2, 1]] 190 | cfg.MODEL.DiNAT.OUT_INDICES = (0, 1, 2, 3) 191 | cfg.MODEL.DiNAT.QKV_BIAS = True 192 | cfg.MODEL.DiNAT.QK_SCALE = None 193 | cfg.MODEL.DiNAT.DROP_RATE = 0 194 | cfg.MODEL.DiNAT.ATTN_DROP_RATE = 0. 195 | cfg.MODEL.DiNAT.IN_PATCH_SIZE = 4 196 | 197 | def add_convnext_config(cfg): 198 | """ 199 | Add config for ConvNeXt Backbone. 200 | """ 201 | 202 | # swin transformer backbone 203 | cfg.MODEL.CONVNEXT = CN() 204 | cfg.MODEL.CONVNEXT.IN_CHANNELS = 3 205 | cfg.MODEL.CONVNEXT.DEPTHS = [3, 3, 27, 3] 206 | cfg.MODEL.CONVNEXT.DIMS = [192, 384, 768, 1536] 207 | cfg.MODEL.CONVNEXT.DROP_PATH_RATE = 0.4 208 | cfg.MODEL.CONVNEXT.LSIT = 1.0 209 | cfg.MODEL.CONVNEXT.OUT_INDICES = [0, 1, 2, 3] 210 | cfg.MODEL.CONVNEXT.OUT_FEATURES = ["res2", "res3", "res4", "res5"] -------------------------------------------------------------------------------- /oneformer/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import datasets 2 | -------------------------------------------------------------------------------- /oneformer/data/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SHI-Labs/OneFormer/4962ef6a96ffb76a76771bfa3e8b3587f209752b/oneformer/data/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /oneformer/data/build.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/build.py 3 | # Modified by Jitesh Jain (https://github.com/praeclarumjj3) 4 | # ------------------------------------------------------------------------------ 5 | 6 | from typing import Any, Callable, Dict, List, Optional, Union 7 | import torch.utils.data as torchdata 8 | 9 | from detectron2.config import configurable 10 | 11 | 12 | from detectron2.data.common import DatasetFromList, MapDataset 13 | from detectron2.data.dataset_mapper import DatasetMapper 14 | from detectron2.data.samplers import ( 15 | InferenceSampler, 16 | ) 17 | from detectron2.data.build import ( 18 | get_detection_dataset_dicts, 19 | trivial_batch_collator 20 | ) 21 | """ 22 | This file contains the default logic to build a dataloader for training or testing. 23 | """ 24 | 25 | __all__ = [ 26 | "build_detection_test_loader", 27 | ] 28 | 29 | 30 | def _test_loader_from_config(cfg, dataset_name, mapper=None): 31 | """ 32 | Uses the given `dataset_name` argument (instead of the names in cfg), because the 33 | standard practice is to evaluate each test set individually (not combining them). 
34 | """ 35 | if isinstance(dataset_name, str): 36 | dataset_name = [dataset_name] 37 | 38 | dataset = get_detection_dataset_dicts( 39 | dataset_name, 40 | filter_empty=False, 41 | proposal_files=[ 42 | cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(x)] for x in dataset_name 43 | ] 44 | if cfg.MODEL.LOAD_PROPOSALS 45 | else None, 46 | ) 47 | if mapper is None: 48 | mapper = DatasetMapper(cfg, False) 49 | return { 50 | "dataset": dataset, 51 | "mapper": mapper, 52 | "num_workers": cfg.DATALOADER.NUM_WORKERS, 53 | "sampler": InferenceSampler(len(dataset)) 54 | if not isinstance(dataset, torchdata.IterableDataset) 55 | else None, 56 | } 57 | 58 | 59 | @configurable(from_config=_test_loader_from_config) 60 | def build_detection_test_loader( 61 | dataset: Union[List[Any], torchdata.Dataset], 62 | *, 63 | mapper: Callable[[Dict[str, Any]], Any], 64 | sampler: Optional[torchdata.Sampler] = None, 65 | batch_size: int = 1, 66 | num_workers: int = 0, 67 | collate_fn: Optional[Callable[[List[Any]], Any]] = None, 68 | ) -> torchdata.DataLoader: 69 | """ 70 | Similar to `build_detection_train_loader`, with default batch size = 1, 71 | and sampler = :class:`InferenceSampler`. This sampler coordinates all workers 72 | to produce the exact set of all samples. 73 | 74 | Args: 75 | dataset: a list of dataset dicts, 76 | or a pytorch dataset (either map-style or iterable). They can be obtained 77 | by using :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`. 78 | mapper: a callable which takes a sample (dict) from dataset 79 | and returns the format to be consumed by the model. 80 | When using cfg, the default choice is ``DatasetMapper(cfg, is_train=False)``. 81 | sampler: a sampler that produces 82 | indices to be applied on ``dataset``. Default to :class:`InferenceSampler`, 83 | which splits the dataset across all workers. Sampler must be None 84 | if `dataset` is iterable. 85 | batch_size: the batch size of the data loader to be created. 86 | Default to 1 image per worker since this is the standard when reporting 87 | inference time in papers. 88 | num_workers: number of parallel data loading workers 89 | collate_fn: same as the argument of `torch.utils.data.DataLoader`. 90 | Defaults to do no collation and return a list of data. 91 | 92 | Returns: 93 | DataLoader: a torch DataLoader, that loads the given detection 94 | dataset, with test-time transformation and batching. 
95 | 96 | Examples: 97 | :: 98 | data_loader = build_detection_test_loader( 99 | DatasetRegistry.get("my_test"), 100 | mapper=DatasetMapper(...)) 101 | 102 | # or, instantiate with a CfgNode: 103 | data_loader = build_detection_test_loader(cfg, "my_test") 104 | """ 105 | if isinstance(dataset, list): 106 | dataset = DatasetFromList(dataset, copy=False) 107 | if mapper is not None: 108 | dataset = MapDataset(dataset, mapper) 109 | if isinstance(dataset, torchdata.IterableDataset): 110 | assert sampler is None, "sampler must be None if dataset is IterableDataset" 111 | else: 112 | if sampler is None: 113 | sampler = InferenceSampler(len(dataset)) 114 | return torchdata.DataLoader( 115 | dataset, 116 | batch_size=batch_size, 117 | sampler=sampler, 118 | drop_last=False, 119 | num_workers=num_workers, 120 | collate_fn=trivial_batch_collator if collate_fn is None else collate_fn, 121 | ) -------------------------------------------------------------------------------- /oneformer/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /oneformer/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from . import ( 2 | register_ade20k_panoptic, 3 | register_cityscapes_panoptic, 4 | register_coco_panoptic_annos_semseg, 5 | register_ade20k_instance, 6 | register_coco_panoptic2instance, 7 | register_mapillary_vistas, 8 | register_mapillary_vistas_panoptic, 9 | ) 10 | -------------------------------------------------------------------------------- /oneformer/data/datasets/register_ade20k_instance.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/datasets/register_ade20k_instance.py 3 | # ------------------------------------------------------------------------------ 4 | 5 | import json 6 | import logging 7 | import numpy as np 8 | import os 9 | from PIL import Image 10 | 11 | from detectron2.data import DatasetCatalog, MetadataCatalog 12 | from detectron2.data.datasets.coco import load_coco_json, register_coco_instances 13 | from detectron2.utils.file_io import PathManager 14 | 15 | ADE_CATEGORIES = [{'id': 7, 'name': 'bed'}, {'id': 8, 'name': 'windowpane'}, {'id': 10, 'name': 'cabinet'}, {'id': 12, 'name': 'person'}, {'id': 14, 'name': 'door'}, {'id': 15, 'name': 'table'}, {'id': 18, 'name': 'curtain'}, {'id': 19, 'name': 'chair'}, {'id': 20, 'name': 'car'}, {'id': 22, 'name': 'painting'}, {'id': 23, 'name': 'sofa'}, {'id': 24, 'name': 'shelf'}, {'id': 27, 'name': 'mirror'}, {'id': 30, 'name': 'armchair'}, {'id': 31, 'name': 'seat'}, {'id': 32, 'name': 'fence'}, {'id': 33, 'name': 'desk'}, {'id': 35, 'name': 'wardrobe'}, {'id': 36, 'name': 'lamp'}, {'id': 37, 'name': 'bathtub'}, {'id': 38, 'name': 'railing'}, {'id': 39, 'name': 'cushion'}, {'id': 41, 'name': 'box'}, {'id': 42, 'name': 'column'}, {'id': 43, 'name': 'signboard'}, {'id': 44, 'name': 'chest of drawers'}, {'id': 45, 'name': 'counter'}, {'id': 47, 'name': 'sink'}, {'id': 49, 'name': 'fireplace'}, {'id': 50, 'name': 'refrigerator'}, {'id': 53, 'name': 'stairs'}, {'id': 55, 'name': 'case'}, {'id': 56, 'name': 'pool table'}, {'id': 57, 'name': 'pillow'}, {'id': 58, 'name': 'screen door'}, {'id': 62, 'name': 'bookcase'}, {'id': 64, 
'name': 'coffee table'}, {'id': 65, 'name': 'toilet'}, {'id': 66, 'name': 'flower'}, {'id': 67, 'name': 'book'}, {'id': 69, 'name': 'bench'}, {'id': 70, 'name': 'countertop'}, {'id': 71, 'name': 'stove'}, {'id': 72, 'name': 'palm'}, {'id': 73, 'name': 'kitchen island'}, {'id': 74, 'name': 'computer'}, {'id': 75, 'name': 'swivel chair'}, {'id': 76, 'name': 'boat'}, {'id': 78, 'name': 'arcade machine'}, {'id': 80, 'name': 'bus'}, {'id': 81, 'name': 'towel'}, {'id': 82, 'name': 'light'}, {'id': 83, 'name': 'truck'}, {'id': 85, 'name': 'chandelier'}, {'id': 86, 'name': 'awning'}, {'id': 87, 'name': 'streetlight'}, {'id': 88, 'name': 'booth'}, {'id': 89, 'name': 'television receiver'}, {'id': 90, 'name': 'airplane'}, {'id': 92, 'name': 'apparel'}, {'id': 93, 'name': 'pole'}, {'id': 95, 'name': 'bannister'}, {'id': 97, 'name': 'ottoman'}, {'id': 98, 'name': 'bottle'}, {'id': 102, 'name': 'van'}, {'id': 103, 'name': 'ship'}, {'id': 104, 'name': 'fountain'}, {'id': 107, 'name': 'washer'}, {'id': 108, 'name': 'plaything'}, {'id': 110, 'name': 'stool'}, {'id': 111, 'name': 'barrel'}, {'id': 112, 'name': 'basket'}, {'id': 115, 'name': 'bag'}, {'id': 116, 'name': 'minibike'}, {'id': 118, 'name': 'oven'}, {'id': 119, 'name': 'ball'}, {'id': 120, 'name': 'food'}, {'id': 121, 'name': 'step'}, {'id': 123, 'name': 'trade name'}, {'id': 124, 'name': 'microwave'}, {'id': 125, 'name': 'pot'}, {'id': 126, 'name': 'animal'}, {'id': 127, 'name': 'bicycle'}, {'id': 129, 'name': 'dishwasher'}, {'id': 130, 'name': 'screen'}, {'id': 132, 'name': 'sculpture'}, {'id': 133, 'name': 'hood'}, {'id': 134, 'name': 'sconce'}, {'id': 135, 'name': 'vase'}, {'id': 136, 'name': 'traffic light'}, {'id': 137, 'name': 'tray'}, {'id': 138, 'name': 'ashcan'}, {'id': 139, 'name': 'fan'}, {'id': 142, 'name': 'plate'}, {'id': 143, 'name': 'monitor'}, {'id': 144, 'name': 'bulletin board'}, {'id': 146, 'name': 'radiator'}, {'id': 147, 'name': 'glass'}, {'id': 148, 'name': 'clock'}, {'id': 149, 'name': 'flag'}] 16 | 17 | 18 | _PREDEFINED_SPLITS = { 19 | # point annotations without masks 20 | "ade20k_instance_train": ( 21 | "ADEChallengeData2016/images/training", 22 | "ADEChallengeData2016/ade20k_instance_train.json", 23 | ), 24 | "ade20k_instance_val": ( 25 | "ADEChallengeData2016/images/validation", 26 | "ADEChallengeData2016/ade20k_instance_val.json", 27 | ), 28 | } 29 | 30 | 31 | def _get_ade_instances_meta(): 32 | thing_ids = [k["id"] for k in ADE_CATEGORIES] 33 | assert len(thing_ids) == 100, len(thing_ids) 34 | # Mapping from the incontiguous ADE category id to an id in [0, 99] 35 | thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} 36 | thing_classes = [k["name"] for k in ADE_CATEGORIES] 37 | ret = { 38 | "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, 39 | "thing_classes": thing_classes, 40 | } 41 | return ret 42 | 43 | 44 | def register_all_ade20k_instance(root): 45 | for key, (image_root, json_file) in _PREDEFINED_SPLITS.items(): 46 | # Assume pre-defined datasets live in `./datasets`. 
47 | register_coco_instances( 48 | key, 49 | _get_ade_instances_meta(), 50 | os.path.join(root, json_file) if "://" not in json_file else json_file, 51 | os.path.join(root, image_root), 52 | ) 53 | 54 | 55 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 56 | register_all_ade20k_instance(_root) 57 | -------------------------------------------------------------------------------- /oneformer/data/datasets/register_coco_panoptic2instance.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/datasets/builtin.py 3 | # Modified by Jitesh Jain (https://github.com/praeclarumjj3) 4 | # ------------------------------------------------------------------------------ 5 | 6 | 7 | """ 8 | This file registers pre-defined datasets at hard-coded paths, and their metadata. 9 | 10 | We hard-code metadata for common datasets. This will enable: 11 | 1. Consistency check when loading the datasets 12 | 2. Use models on these standard datasets directly and run demos, 13 | without having to download the dataset annotations 14 | 15 | We hard-code some paths to the dataset that's assumed to 16 | exist in "./datasets/". 17 | 18 | Users SHOULD NOT use this file to create new dataset / metadata for new dataset. 19 | To add new dataset, refer to the tutorial "docs/DATASETS.md". 20 | """ 21 | 22 | import os 23 | from detectron2.data.datasets.builtin_meta import _get_builtin_metadata 24 | from detectron2.data.datasets.coco import register_coco_instances 25 | 26 | 27 | _PREDEFINED_SPLITS_COCO = { 28 | "coco_2017_val_panoptic2instance": ("coco/val2017", "coco/annotations/panoptic2instances_val2017.json"), 29 | } 30 | 31 | 32 | def register_panoptic2instances_coco(root): 33 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_COCO.items(): 34 | # Assume pre-defined datasets live in `./datasets`. 35 | register_coco_instances( 36 | key, 37 | _get_builtin_metadata("coco"), 38 | os.path.join(root, json_file) if "://" not in json_file else json_file, 39 | os.path.join(root, image_root), 40 | ) 41 | 42 | 43 | _root = os.path.expanduser(os.getenv("DETECTRON2_DATASETS", "datasets")) 44 | register_panoptic2instances_coco(_root) -------------------------------------------------------------------------------- /oneformer/data/tokenizer.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # MIT License 3 | # 4 | # Copyright (c) 2021 OpenAI 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 19 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | # SOFTWARE. 23 | # 24 | # Modified by Jiarui Xu 25 | # ------------------------------------------------------------------------- 26 | 27 | import gzip 28 | import html 29 | import os 30 | from functools import lru_cache 31 | 32 | import ftfy 33 | import regex as re 34 | import torch 35 | 36 | 37 | @lru_cache() 38 | def default_bpe(): 39 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), 'bpe_simple_vocab_16e6.txt.gz') 40 | 41 | 42 | @lru_cache() 43 | def bytes_to_unicode(): 44 | """Returns list of utf-8 byte and a corresponding list of unicode strings. 45 | 46 | The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab 47 | if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for decent 48 | coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup tables 49 | between utf-8 bytes and unicode strings. And avoids mapping to whitespace/control characters the bpe code barfs on. 50 | """ 51 | bs = list(range(ord('!'), ord('~') + 1)) + list(range(ord('¡'), ord('¬') + 1)) + list(range(ord('®'), ord('ÿ') + 1)) 52 | cs = bs[:] 53 | n = 0 54 | for b in range(2**8): 55 | if b not in bs: 56 | bs.append(b) 57 | cs.append(2**8 + n) 58 | n += 1 59 | cs = [chr(n) for n in cs] 60 | return dict(zip(bs, cs)) 61 | 62 | 63 | def get_pairs(word): 64 | """Return set of symbol pairs in a word. 65 | 66 | Word is represented as tuple of symbols (symbols being variable-length strings). 
67 | """
68 | pairs = set()
69 | prev_char = word[0]
70 | for char in word[1:]:
71 | pairs.add((prev_char, char))
72 | prev_char = char
73 | return pairs
74 |
75 |
76 | def basic_clean(text):
77 | text = ftfy.fix_text(text)
78 | text = html.unescape(html.unescape(text))
79 | return text.strip()
80 |
81 |
82 | def whitespace_clean(text):
83 | text = re.sub(r'\s+', ' ', text)
84 | text = text.strip()
85 | return text
86 |
87 | class Tokenize:
88 |
89 | def __init__(self, tokenizer, max_seq_len=77, truncate=True):
90 | self.tokenizer = tokenizer
91 | self.max_seq_len = max_seq_len
92 | self.truncate = truncate
93 |
94 | def __call__(self, texts):
95 | expanded_dim = False
96 | if isinstance(texts, str):
97 | texts = [texts]
98 | expanded_dim = True
99 |
100 | sot_token = self.tokenizer.encoder['<|startoftext|>']
101 | eot_token = self.tokenizer.encoder['<|endoftext|>']
102 | all_tokens = [[sot_token] + self.tokenizer.encode(text) + [eot_token] for text in texts]
103 | result = torch.zeros(len(all_tokens), self.max_seq_len, dtype=torch.long)
104 |
105 | for i, tokens in enumerate(all_tokens):
106 | if len(tokens) > self.max_seq_len:
107 | if self.truncate:
108 | tokens = tokens[:self.max_seq_len]
109 | tokens[-1] = eot_token
110 | else:
111 | raise RuntimeError(f'Input {texts[i]} is too long for context length {self.max_seq_len}')
112 | result[i, :len(tokens)] = torch.tensor(tokens)
113 |
114 | if expanded_dim:
115 | return result[0]
116 |
117 | return result
118 |
119 |
120 | class SimpleTokenizer(object):
121 |
122 | def __init__(self, bpe_path: str = default_bpe()):
123 | self.byte_encoder = bytes_to_unicode()
124 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
125 | merges = gzip.open(bpe_path).read().decode('utf-8').split('\n')
126 | merges = merges[1:49152 - 256 - 2 + 1]
127 | merges = [tuple(merge.split()) for merge in merges]
128 | vocab = list(bytes_to_unicode().values())
129 | vocab = vocab + [v + '</w>' for v in vocab]
130 | for merge in merges:
131 | vocab.append(''.join(merge))
132 | vocab.extend(['<|startoftext|>', '<|endoftext|>'])
133 | self.encoder = dict(zip(vocab, range(len(vocab))))
134 | self.decoder = {v: k for k, v in self.encoder.items()}
135 | self.bpe_ranks = dict(zip(merges, range(len(merges))))
136 | self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
137 | self.pat = re.compile(
138 | r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
139 | re.IGNORECASE)
140 |
141 | def bpe(self, token):
142 | if token in self.cache:
143 | return self.cache[token]
144 | word = tuple(token[:-1]) + (token[-1] + '</w>', )
145 | pairs = get_pairs(word)
146 |
147 | if not pairs:
148 | return token + '</w>'
149 |
150 | while True:
151 | bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
152 | if bigram not in self.bpe_ranks:
153 | break
154 | first, second = bigram
155 | new_word = []
156 | i = 0
157 | while i < len(word):
158 | try:
159 | j = word.index(first, i)
160 | new_word.extend(word[i:j])
161 | i = j
162 | except: # noqa: E722
163 | new_word.extend(word[i:])
164 | break
165 |
166 | if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
167 | new_word.append(first + second)
168 | i += 2
169 | else:
170 | new_word.append(word[i])
171 | i += 1
172 | new_word = tuple(new_word)
173 | word = new_word
174 | if len(word) == 1:
175 | break
176 | else:
177 | pairs = get_pairs(word)
178 | word = ' '.join(word)
179 | self.cache[token] = word
180 | return word
181 |
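# A minimal usage sketch (illustrative text only): the '</w>' suffix marks end-of-word
# symbols, so "cat" as a whole word and "cat" as a prefix of a longer word map to
# different BPE tokens. Wrapping SimpleTokenizer with Tokenize pads/truncates to a
# fixed context length:
#   tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=77)
#   tokens = tokenizer("the task is panoptic")   # LongTensor of shape (77,)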
182 | def encode(self, text):
183 | bpe_tokens = []
184 | text = whitespace_clean(basic_clean(text)).lower()
185 | for token in re.findall(self.pat, text):
186 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
187 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
188 | return bpe_tokens
189 |
190 | def decode(self, tokens):
191 | text = ''.join([self.decoder[token] for token in tokens])
192 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors='replace').replace('</w>', ' ')
193 | return text
-------------------------------------------------------------------------------- /oneformer/datasetmapper_tta.py: --------------------------------------------------------------------------------
1 | import copy
2 | import numpy as np
3 | from typing import List
4 | import torch
5 | from fvcore.transforms import NoOpTransform
6 | from torch import nn
7 |
8 | from detectron2.config import configurable
9 | from detectron2.data.transforms import (
10 | RandomFlip,
11 | ResizeShortestEdge,
12 | ResizeTransform,
13 | apply_augmentations,
14 | )
15 |
16 | __all__ = ["DatasetMapperTTA"]
17 |
18 |
19 | class DatasetMapperTTA:
20 | """
21 | Implement test-time augmentation for detection data.
22 | It is a callable which takes a dataset dict from a detection dataset,
23 | and returns a list of dataset dicts where the images
24 | are augmented from the input image by the transformations defined in the config.
25 | This is used for test-time augmentation.
26 | """
27 |
28 | @configurable
29 | def __init__(self, min_sizes: List[int], max_size: int, flip: bool):
30 | """
31 | Args:
32 | min_sizes: list of short-edge sizes to resize the image to
33 | max_size: maximum height or width of resized images
34 | flip: whether to apply flipping augmentation
35 | """
36 | self.min_sizes = min_sizes
37 | self.max_size = max_size
38 | self.flip = flip
39 |
40 | @classmethod
41 | def from_config(cls, cfg):
42 | return {
43 | "min_sizes": cfg.TEST.AUG.MIN_SIZES,
44 | "max_size": cfg.TEST.AUG.MAX_SIZE,
45 | "flip": cfg.TEST.AUG.FLIP,
46 | }
47 |
48 | def __call__(self, dataset_dict):
49 | """
50 | Args:
51 | dataset_dict: a dict in standard model input format. See tutorials for details.
52 | Returns:
53 | list[dict]:
54 | a list of dicts, which contain augmented versions of the input image.
55 | The total number of dicts is ``len(min_sizes) * (2 if flip else 1)``.
56 | Each dict has field "transforms" which is a TransformList,
57 | containing the transforms that are used to generate this image.
58 | """ 59 | numpy_image = dataset_dict["image"].permute(1, 2, 0).numpy() 60 | shape = numpy_image.shape 61 | orig_shape = (dataset_dict["height"], dataset_dict["width"]) 62 | 63 | if shape[:2] != orig_shape: 64 | # It transforms the "original" image in the dataset to the input image 65 | pre_tfm = ResizeTransform(orig_shape[0], orig_shape[1], shape[0], shape[1]) 66 | else: 67 | pre_tfm = NoOpTransform() 68 | 69 | # Create all combinations of augmentations to use 70 | aug_candidates = [] # each element is a list[Augmentation] 71 | for min_size in self.min_sizes: 72 | resize = ResizeShortestEdge(min_size, self.max_size) 73 | aug_candidates.append([resize]) # resize only 74 | if self.flip: 75 | flip = RandomFlip(prob=1.0) 76 | aug_candidates.append([resize, flip]) # resize + flip 77 | 78 | # Apply all the augmentations 79 | ret = [] 80 | for aug in aug_candidates: 81 | new_image, tfms = apply_augmentations(aug, np.copy(numpy_image)) 82 | torch_image = torch.from_numpy(np.ascontiguousarray(new_image.transpose(2, 0, 1))) 83 | 84 | dic = copy.deepcopy(dataset_dict) 85 | dic["transforms"] = pre_tfm + tfms 86 | dic["image"] = torch_image 87 | ret.append(dic) 88 | return ret -------------------------------------------------------------------------------- /oneformer/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .detection_coco_evaluator import * 2 | from .coco_evaluator import * 3 | from .cityscapes_evaluation import CityscapesInstanceEvaluator -------------------------------------------------------------------------------- /oneformer/evaluation/instance_evaluation.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/evaluation/instance_evaluation.py 3 | # ------------------------------------------------------------------------------ 4 | 5 | import contextlib 6 | import copy 7 | import io 8 | import itertools 9 | import json 10 | import logging 11 | import numpy as np 12 | import os 13 | import pickle 14 | from collections import OrderedDict 15 | import pycocotools.mask as mask_util 16 | import torch 17 | from pycocotools.coco import COCO 18 | from pycocotools.cocoeval import COCOeval 19 | from tabulate import tabulate 20 | 21 | import detectron2.utils.comm as comm 22 | from detectron2.config import CfgNode 23 | from detectron2.data import MetadataCatalog 24 | from detectron2.data.datasets.coco import convert_to_coco_json 25 | from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco 26 | from detectron2.evaluation.fast_eval_api import COCOeval_opt 27 | from detectron2.structures import Boxes, BoxMode, pairwise_iou 28 | from detectron2.utils.file_io import PathManager 29 | from detectron2.utils.logger import create_small_table 30 | 31 | 32 | # modified from COCOEvaluator for instance segmetnat 33 | class InstanceSegEvaluator(COCOEvaluator): 34 | """ 35 | Evaluate AR for object proposals, AP for instance detection/segmentation, AP 36 | for keypoint detection outputs using COCO's metrics. 37 | See http://cocodataset.org/#detection-eval and 38 | http://cocodataset.org/#keypoints-eval to understand its metrics. 39 | The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means 40 | the metric cannot be computed (e.g. due to no predictions made). 
41 | 42 | In addition to COCO, this evaluator is able to support any bounding box detection, 43 | instance segmentation, or keypoint detection dataset. 44 | """ 45 | 46 | def _eval_predictions(self, predictions, img_ids=None): 47 | """ 48 | Evaluate predictions. Fill self._results with the metrics of the tasks. 49 | """ 50 | self._logger.info("Preparing results for COCO format ...") 51 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) 52 | tasks = self._tasks or self._tasks_from_predictions(coco_results) 53 | 54 | # unmap the category ids for COCO 55 | if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): 56 | dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id 57 | # all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) 58 | # num_classes = len(all_contiguous_ids) 59 | # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 60 | 61 | reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} 62 | for result in coco_results: 63 | category_id = result["category_id"] 64 | # assert category_id < num_classes, ( 65 | # f"A prediction has class={category_id}, " 66 | # f"but the dataset only has {num_classes} classes and " 67 | # f"predicted class id should be in [0, {num_classes - 1}]." 68 | # ) 69 | assert category_id in reverse_id_mapping, ( 70 | f"A prediction has class={category_id}, " 71 | f"but the dataset only has class ids in {dataset_id_to_contiguous_id}." 72 | ) 73 | result["category_id"] = reverse_id_mapping[category_id] 74 | 75 | if self._output_dir: 76 | file_path = os.path.join(self._output_dir, "coco_instances_results.json") 77 | self._logger.info("Saving results to {}".format(file_path)) 78 | with PathManager.open(file_path, "w") as f: 79 | f.write(json.dumps(coco_results)) 80 | f.flush() 81 | 82 | if not self._do_evaluation: 83 | self._logger.info("Annotations are not available for evaluation.") 84 | return 85 | 86 | self._logger.info( 87 | "Evaluating predictions with {} COCO API...".format( 88 | "unofficial" if self._use_fast_impl else "official" 89 | ) 90 | ) 91 | for task in sorted(tasks): 92 | assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!" 93 | coco_eval = ( 94 | _evaluate_predictions_on_coco( 95 | self._coco_api, 96 | coco_results, 97 | task, 98 | kpt_oks_sigmas=self._kpt_oks_sigmas, 99 | use_fast_impl=self._use_fast_impl, 100 | img_ids=img_ids, 101 | max_dets_per_image=self._max_dets_per_image, 102 | ) 103 | if len(coco_results) > 0 104 | else None # cocoapi does not handle empty results very well 105 | ) 106 | 107 | res = self._derive_coco_results( 108 | coco_eval, task, class_names=self._metadata.get("thing_classes") 109 | ) 110 | self._results[task] = res 111 | -------------------------------------------------------------------------------- /oneformer/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone.swin import D2SwinTransformer 2 | from .backbone.dinat import D2DiNAT 3 | from .backbone.convnext import D2ConvNeXt 4 | from .pixel_decoder.fpn import BasePixelDecoder 5 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder 6 | from .meta_arch.oneformer_head import OneFormerHead 7 | -------------------------------------------------------------------------------- /oneformer/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
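# A small illustrative sketch (config values are hypothetical): importing the modules in
# oneformer/modeling/__init__.py registers them with detectron2's registries, so they can
# be selected by name from a config, e.g.
#   cfg.MODEL.BACKBONE.NAME = "D2SwinTransformer"        # resolved via BACKBONE_REGISTRY
#   cfg.MODEL.SEM_SEG_HEAD.NAME = "OneFormerHead"        # resolved via SEM_SEG_HEADS_REGISTRY
#   backbone = build_backbone(cfg)                        # from detectron2.modeling
#   sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape())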
2 | -------------------------------------------------------------------------------- /oneformer/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /oneformer/modeling/meta_arch/oneformer_head.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/modeling/meta_arch/mask_former_head.py 3 | # Modified by Jitesh Jain (https://github.com/praeclarumjj3) 4 | # ------------------------------------------------------------------------------ 5 | 6 | import logging 7 | from copy import deepcopy 8 | from typing import Callable, Dict, List, Optional, Tuple, Union 9 | 10 | import fvcore.nn.weight_init as weight_init 11 | from torch import nn 12 | from torch.nn import functional as F 13 | 14 | from detectron2.config import configurable 15 | from detectron2.layers import Conv2d, ShapeSpec, get_norm 16 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY 17 | from ..pixel_decoder.fpn import build_pixel_decoder 18 | from ..transformer_decoder.oneformer_transformer_decoder import build_transformer_decoder 19 | 20 | @SEM_SEG_HEADS_REGISTRY.register() 21 | class OneFormerHead(nn.Module): 22 | 23 | _version = 2 24 | 25 | def _load_from_state_dict( 26 | self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs 27 | ): 28 | version = local_metadata.get("version", None) 29 | if version is None or version < 2: 30 | # Do not warn if train from scratch 31 | scratch = True 32 | logger = logging.getLogger(__name__) 33 | for k in list(state_dict.keys()): 34 | newk = k 35 | if "sem_seg_head" in k and not k.startswith(prefix + "predictor"): 36 | newk = k.replace(prefix, prefix + "pixel_decoder.") 37 | # logger.debug(f"{k} ==> {newk}") 38 | if newk != k: 39 | state_dict[newk] = state_dict[k] 40 | del state_dict[k] 41 | scratch = False 42 | 43 | if not scratch: 44 | logger.warning( 45 | f"Weight format of {self.__class__.__name__} have changed! " 46 | "Please upgrade your models. Applying automatic conversion now ..." 47 | ) 48 | 49 | @configurable 50 | def __init__( 51 | self, 52 | input_shape: Dict[str, ShapeSpec], 53 | *, 54 | num_classes: int, 55 | pixel_decoder: nn.Module, 56 | loss_weight: float = 1.0, 57 | ignore_value: int = -1, 58 | # extra parameters 59 | transformer_predictor: nn.Module, 60 | transformer_in_feature: str, 61 | ): 62 | """ 63 | NOTE: this interface is experimental. 64 | Args: 65 | input_shape: shapes (channels and stride) of the input features 66 | num_classes: number of classes to predict 67 | pixel_decoder: the pixel decoder module 68 | loss_weight: loss weight 69 | ignore_value: category id to be ignored during training. 
70 | transformer_predictor: the transformer decoder that makes prediction 71 | transformer_in_feature: input feature name to the transformer_predictor 72 | """ 73 | super().__init__() 74 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) 75 | self.in_features = [k for k, v in input_shape] 76 | feature_strides = [v.stride for k, v in input_shape] 77 | feature_channels = [v.channels for k, v in input_shape] 78 | 79 | self.ignore_value = ignore_value 80 | self.common_stride = 4 81 | self.loss_weight = loss_weight 82 | 83 | self.pixel_decoder = pixel_decoder 84 | self.predictor = transformer_predictor 85 | self.transformer_in_feature = transformer_in_feature 86 | 87 | self.num_classes = num_classes 88 | 89 | @classmethod 90 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): 91 | # figure out in_channels to transformer predictor 92 | if cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder": 93 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 94 | elif cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding": 95 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 96 | elif cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder": 97 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 98 | else: 99 | transformer_predictor_in_channels = input_shape[cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE].channels 100 | 101 | return { 102 | "input_shape": { 103 | k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES 104 | }, 105 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 106 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 107 | "pixel_decoder": build_pixel_decoder(cfg, input_shape), 108 | "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, 109 | "transformer_in_feature": cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE, 110 | "transformer_predictor": build_transformer_decoder( 111 | cfg, 112 | transformer_predictor_in_channels, 113 | mask_classification=True, 114 | ), 115 | } 116 | 117 | def forward(self, features, tasks, mask=None): 118 | return self.layers(features, tasks, mask) 119 | 120 | def layers(self, features, tasks, mask=None): 121 | mask_features, transformer_encoder_features, multi_scale_features, _, _ = self.pixel_decoder.forward_features(features) 122 | 123 | if self.transformer_in_feature == "multi_scale_pixel_decoder": 124 | predictions = self.predictor(multi_scale_features, mask_features, tasks, mask) 125 | else: 126 | if self.transformer_in_feature == "transformer_encoder": 127 | assert ( 128 | transformer_encoder_features is not None 129 | ), "Please use the TransformerEncoderPixelDecoder." 
130 | predictions = self.predictor(transformer_encoder_features, mask_features, mask) 131 | elif self.transformer_in_feature == "pixel_embedding": 132 | predictions = self.predictor(mask_features, mask_features, mask) 133 | else: 134 | predictions = self.predictor(features[self.transformer_in_feature], mask_features, mask) 135 | return predictions 136 | -------------------------------------------------------------------------------- /oneformer/modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /oneformer/modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn_func import MSDeformAttnFunction 10 | 11 | -------------------------------------------------------------------------------- /oneformer/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | if torch.cuda.is_available(): 22 | try: 23 | import MultiScaleDeformableAttention as MSDA 24 | except ModuleNotFoundError as e: 25 | info_string = ( 26 | "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" 27 | "\t`cd mask2former/modeling/pixel_decoder/ops`\n" 28 | "\t`sh make.sh`\n" 29 | ) 30 | raise ModuleNotFoundError(info_string) 31 | else: 32 | MultiScaleDeformableAttention = None 33 | 34 | 35 | class MSDeformAttnFunction(Function): 36 | @staticmethod 37 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 38 | ctx.im2col_step = im2col_step 39 | output = MSDA.ms_deform_attn_forward( 40 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 41 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 42 | return output 43 | 44 | @staticmethod 45 | @once_differentiable 46 | def backward(ctx, grad_output): 47 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 48 | grad_value, grad_sampling_loc, grad_attn_weight = \ 49 | MSDA.ms_deform_attn_backward( 50 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 51 | 52 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 53 | 54 | 55 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 56 | # for debug and test only, 57 | # need to use cuda version instead 58 | N_, S_, M_, D_ = value.shape 59 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 60 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 61 | sampling_grids = 2 * sampling_locations - 1 62 | sampling_value_list = [] 63 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 64 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 65 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 66 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 67 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 68 | # N_*M_, D_, Lq_, P_ 69 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 70 | mode='bilinear', padding_mode='zeros', align_corners=False) 71 | sampling_value_list.append(sampling_value_l_) 72 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 73 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 74 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 75 | return output.transpose(1, 2).contiguous() 76 | -------------------------------------------------------------------------------- /oneformer/modeling/pixel_decoder/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 
------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | # Copyright (c) Facebook, Inc. and its affiliates. 11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 12 | 13 | python setup.py build install 14 | -------------------------------------------------------------------------------- /oneformer/modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | 10 | from .ms_deform_attn import MSDeformAttn 11 | -------------------------------------------------------------------------------- /oneformer/modeling/pixel_decoder/ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import warnings 17 | import math 18 | 19 | import torch 20 | from torch import nn 21 | import torch.nn.functional as F 22 | from torch.nn.init import xavier_uniform_, constant_ 23 | 24 | if torch.cuda.is_available(): 25 | from ..functions import MSDeformAttnFunction 26 | else: 27 | MSDeformAttnFunction = None 28 | from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch 29 | 30 | 31 | def _is_power_of_2(n): 32 | if (not isinstance(n, int)) or (n < 0): 33 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 34 | return (n & (n-1) == 0) and n != 0 35 | 36 | 37 | class MSDeformAttn(nn.Module): 38 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): 39 | """ 40 | Multi-Scale Deformable Attention Module 41 | :param d_model hidden dimension 42 | :param n_levels number of feature levels 43 | :param n_heads number of attention heads 44 | :param n_points number of sampling points per attention head per feature level 45 | """ 46 | super().__init__() 47 | if d_model % n_heads != 0: 48 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 49 | _d_per_head = d_model // n_heads 50 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 51 | if not _is_power_of_2(_d_per_head): 52 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 53 | "which is more efficient in our CUDA implementation.") 54 | 55 | self.im2col_step = 128 56 | 57 | self.d_model = d_model 58 | self.n_levels = n_levels 59 | self.n_heads = n_heads 60 | self.n_points = n_points 61 | 62 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 63 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 64 | self.value_proj = nn.Linear(d_model, d_model) 65 | self.output_proj = nn.Linear(d_model, d_model) 66 | 67 | self._reset_parameters() 68 | 69 | def _reset_parameters(self): 70 | constant_(self.sampling_offsets.weight.data, 0.) 71 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 72 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 73 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 74 | for i in range(self.n_points): 75 | grid_init[:, :, i, :] *= i + 1 76 | with torch.no_grad(): 77 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 78 | constant_(self.attention_weights.weight.data, 0.) 79 | constant_(self.attention_weights.bias.data, 0.) 80 | xavier_uniform_(self.value_proj.weight.data) 81 | constant_(self.value_proj.bias.data, 0.) 82 | xavier_uniform_(self.output_proj.weight.data) 83 | constant_(self.output_proj.bias.data, 0.) 
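# A toy shape walk-through (values here are illustrative, not required): with the defaults
# d_model=256, n_levels=4, n_heads=8, n_points=4 and feature maps of 64x64, 32x32, 16x16, 8x8:
#   input_spatial_shapes    -> [[64, 64], [32, 32], [16, 16], [8, 8]]
#   input_level_start_index -> [0, 4096, 5120, 5376]
#   input_flatten           -> (N, 4096 + 1024 + 256 + 64, 256) = (N, 5440, 256)
#   query                   -> (N, Len_q, 256), reference_points -> (N, Len_q, 4, 2) in [0, 1]
#   output                  -> (N, Len_q, 256)
# Each query predicts n_heads * n_levels * n_points = 128 sampling offsets around its
# reference points, with attention weights softmax-normalized over the 16 samples of each head.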
84 | 85 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 86 | """ 87 | :param query (N, Length_{query}, C) 88 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 89 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 90 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 91 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 92 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 93 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 94 | 95 | :return output (N, Length_{query}, C) 96 | """ 97 | N, Len_q, _ = query.shape 98 | N, Len_in, _ = input_flatten.shape 99 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 100 | 101 | value = self.value_proj(input_flatten) 102 | if input_padding_mask is not None: 103 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 104 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 105 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 106 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 107 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 108 | # N, Len_q, n_heads, n_levels, n_points, 2 109 | if reference_points.shape[-1] == 2: 110 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 111 | sampling_locations = reference_points[:, :, None, :, None, :] \ 112 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] 113 | elif reference_points.shape[-1] == 4: 114 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 115 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 116 | else: 117 | raise ValueError( 118 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 119 | if torch.cuda.is_available(): 120 | output = MSDeformAttnFunction.apply( 121 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 122 | else: 123 | ## CPU 124 | output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 125 | output = self.output_proj(output) 126 | return output 127 | -------------------------------------------------------------------------------- /oneformer/modeling/pixel_decoder/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | import os 13 | import glob 14 | 15 | import torch 16 | 17 | from torch.utils.cpp_extension import CUDA_HOME 18 | from torch.utils.cpp_extension import CppExtension 19 | from torch.utils.cpp_extension import CUDAExtension 20 | 21 | from setuptools import find_packages 22 | from setuptools import setup 23 | 24 | requirements = ["torch", "torchvision"] 25 | 26 | def get_extensions(): 27 | this_dir = os.path.dirname(os.path.abspath(__file__)) 28 | extensions_dir = os.path.join(this_dir, "src") 29 | 30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 33 | 34 | sources = main_file + source_cpu 35 | extension = CppExtension 36 | extra_compile_args = {"cxx": []} 37 | define_macros = [] 38 | 39 | # Force cuda since torch ask for a device, not if cuda is in fact available. 40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: 41 | extension = CUDAExtension 42 | sources += source_cuda 43 | define_macros += [("WITH_CUDA", None)] 44 | extra_compile_args["nvcc"] = [ 45 | "-DCUDA_HAS_FP16=1", 46 | "-D__CUDA_NO_HALF_OPERATORS__", 47 | "-D__CUDA_NO_HALF_CONVERSIONS__", 48 | "-D__CUDA_NO_HALF2_OPERATORS__", 49 | ] 50 | else: 51 | if CUDA_HOME is None: 52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') 53 | else: 54 | raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') 55 | 56 | sources = [os.path.join(extensions_dir, s) for s in sources] 57 | include_dirs = [extensions_dir] 58 | ext_modules = [ 59 | extension( 60 | "MultiScaleDeformableAttention", 61 | sources, 62 | include_dirs=include_dirs, 63 | define_macros=define_macros, 64 | extra_compile_args=extra_compile_args, 65 | ) 66 | ] 67 | return ext_modules 68 | 69 | setup( 70 | name="MultiScaleDeformableAttention", 71 | version="1.0", 72 | author="Weijie Su", 73 | url="https://github.com/fundamentalvision/Deformable-DETR", 74 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 75 | packages=find_packages(exclude=("configs", "tests",)), 76 | ext_modules=get_extensions(), 77 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 78 | ) 79 | -------------------------------------------------------------------------------- /oneformer/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #include <vector>
17 |
18 | #include <ATen/ATen.h>
19 | #include <ATen/cuda/CUDAContext.h>
20 |
21 |
22 | at::Tensor
23 | ms_deform_attn_cpu_forward(
24 | const at::Tensor &value,
25 | const at::Tensor &spatial_shapes,
26 | const at::Tensor &level_start_index,
27 | const at::Tensor &sampling_loc,
28 | const at::Tensor &attn_weight,
29 | const int im2col_step)
30 | {
31 | AT_ERROR("Not implement on cpu");
32 | }
33 |
34 | std::vector<at::Tensor>
35 | ms_deform_attn_cpu_backward(
36 | const at::Tensor &value,
37 | const at::Tensor &spatial_shapes,
38 | const at::Tensor &level_start_index,
39 | const at::Tensor &sampling_loc,
40 | const at::Tensor &attn_weight,
41 | const at::Tensor &grad_output,
42 | const int im2col_step)
43 | {
44 | AT_ERROR("Not implement on cpu");
45 | }
46 |
47 |
-------------------------------------------------------------------------------- /oneformer/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: --------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #pragma once
17 | #include <torch/extension.h>
18 |
19 | at::Tensor
20 | ms_deform_attn_cpu_forward(
21 | const at::Tensor &value,
22 | const at::Tensor &spatial_shapes,
23 | const at::Tensor &level_start_index,
24 | const at::Tensor &sampling_loc,
25 | const at::Tensor &attn_weight,
26 | const int im2col_step);
27 |
28 | std::vector<at::Tensor>
29 | ms_deform_attn_cpu_backward(
30 | const at::Tensor &value,
31 | const at::Tensor &spatial_shapes,
32 | const at::Tensor &level_start_index,
33 | const at::Tensor &sampling_loc,
34 | const at::Tensor &attn_weight,
35 | const at::Tensor &grad_output,
36 | const int im2col_step);
37 |
38 |
39 |
-------------------------------------------------------------------------------- /oneformer/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu: --------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #include <vector>
17 | #include "cuda/ms_deform_im2col_cuda.cuh"
18 |
19 | #include <ATen/ATen.h>
20 | #include <ATen/cuda/CUDAContext.h>
21 | #include <cuda.h>
22 | #include <cuda_runtime.h>
23 |
24 |
25 | at::Tensor ms_deform_attn_cuda_forward(
26 | const at::Tensor &value,
27 | const at::Tensor &spatial_shapes,
28 | const at::Tensor &level_start_index,
29 | const at::Tensor &sampling_loc,
30 | const at::Tensor &attn_weight,
31 | const int im2col_step)
32 | {
33 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
34 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
35 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
36 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
37 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
38 |
39 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
40 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
41 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
42 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
43 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
44 |
45 | const int batch = value.size(0);
46 | const int spatial_size = value.size(1);
47 | const int num_heads = value.size(2);
48 | const int channels = value.size(3);
49 |
50 | const int num_levels = spatial_shapes.size(0);
51 |
52 | const int num_query = sampling_loc.size(1);
53 | const int num_point = sampling_loc.size(4);
54 |
55 | const int im2col_step_ = std::min(batch, im2col_step);
56 |
57 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
58 |
59 | auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
60 |
61 | const int batch_n = im2col_step_;
62 | auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
63 | auto per_value_size = spatial_size * num_heads * channels;
64 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
65 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
66 | for (int n = 0; n < batch/im2col_step_; ++n)
67 | {
68 | auto columns = output_n.select(0, n);
69 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] {
70 | ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
71 | value.data<scalar_t>() + n * im2col_step_ * per_value_size,
72 | spatial_shapes.data<int64_t>(),
73 | level_start_index.data<int64_t>(),
74 | sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
75 | attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
76 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
77 | columns.data<scalar_t>());
78 |
79 | }));
80 | }
81 |
82 | output = output.view({batch, num_query, num_heads*channels});
83 |
84 | return output;
85 | }
86 |
87 |
88 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
89 | const at::Tensor &value,
90 | const at::Tensor &spatial_shapes,
91 | const at::Tensor &level_start_index,
92 | const at::Tensor &sampling_loc,
93 | const at::Tensor &attn_weight,
94 | const at::Tensor &grad_output,
95 | const int im2col_step)
96 | {
97 |
98 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
99 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
100 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
101 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
102 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
103 | AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
104 |
105 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
106 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
107 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
108 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
109 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
110 | AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
111 |
112 | const int batch = value.size(0);
113 | const int spatial_size = value.size(1);
114 | const int num_heads = value.size(2);
115 | const int channels = value.size(3);
116 |
117 | const int num_levels = spatial_shapes.size(0);
118 |
119 | const int num_query = sampling_loc.size(1);
120 | const int num_point = sampling_loc.size(4);
121 |
122 | const int im2col_step_ = std::min(batch, im2col_step);
123 |
124 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
125 |
126 | auto grad_value = at::zeros_like(value);
127 | auto grad_sampling_loc = at::zeros_like(sampling_loc);
128 | auto grad_attn_weight = at::zeros_like(attn_weight);
129 |
130 | const int batch_n = im2col_step_;
131 | auto per_value_size = spatial_size * num_heads * channels;
132 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
133 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
134 | auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
135 |
136 | for (int n = 0; n < batch/im2col_step_; ++n)
137 | {
138 | auto grad_output_g = grad_output_n.select(0, n);
139 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] {
140 | ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
141 | grad_output_g.data<scalar_t>(),
142 | value.data<scalar_t>() + n * im2col_step_ * per_value_size,
143 | spatial_shapes.data<int64_t>(),
144 | level_start_index.data<int64_t>(),
145 | sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
146 | attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
147 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
148 | grad_value.data<scalar_t>() + n * im2col_step_ * per_value_size,
149 | grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
150 | grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
151 |
152 | }));
153 | }
154 |
155 | return {
156 | grad_value, grad_sampling_loc, grad_attn_weight
157 | };
158 | }
--------------------------------------------------------------------------------
/oneformer/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: --------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #pragma once
17 | #include <torch/extension.h>
18 |
19 | at::Tensor ms_deform_attn_cuda_forward(
20 | const at::Tensor &value,
21 | const at::Tensor &spatial_shapes,
22 | const at::Tensor &level_start_index,
23 | const at::Tensor &sampling_loc,
24 | const at::Tensor &attn_weight,
25 | const int im2col_step);
26 |
27 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
28 | const at::Tensor &value,
29 | const at::Tensor &spatial_shapes,
30 | const at::Tensor &level_start_index,
31 | const at::Tensor &sampling_loc,
32 | const at::Tensor &attn_weight,
33 | const at::Tensor &grad_output,
34 | const int im2col_step);
35 |
36 |
-------------------------------------------------------------------------------- /oneformer/modeling/pixel_decoder/ops/src/ms_deform_attn.h: --------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #pragma once
17 |
18 | #include "cpu/ms_deform_attn_cpu.h"
19 |
20 | #ifdef WITH_CUDA
21 | #include "cuda/ms_deform_attn_cuda.h"
22 | #endif
23 |
24 |
25 | at::Tensor
26 | ms_deform_attn_forward(
27 | const at::Tensor &value,
28 | const at::Tensor &spatial_shapes,
29 | const at::Tensor &level_start_index,
30 | const at::Tensor &sampling_loc,
31 | const at::Tensor &attn_weight,
32 | const int im2col_step)
33 | {
34 | if (value.type().is_cuda())
35 | {
36 | #ifdef WITH_CUDA
37 | return ms_deform_attn_cuda_forward(
38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
39 | #else
40 | AT_ERROR("Not compiled with GPU support");
41 | #endif
42 | }
43 | AT_ERROR("Not implemented on the CPU");
44 | }
45 |
46 | std::vector<at::Tensor>
47 | ms_deform_attn_backward(
48 | const at::Tensor &value,
49 | const at::Tensor &spatial_shapes,
50 | const at::Tensor &level_start_index,
51 | const at::Tensor &sampling_loc,
52 | const at::Tensor &attn_weight,
53 | const at::Tensor &grad_output,
54 | const int im2col_step)
55 | {
56 | if (value.type().is_cuda())
57 | {
58 | #ifdef WITH_CUDA
59 | return ms_deform_attn_cuda_backward(
60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
61 | #else
62 | AT_ERROR("Not compiled with GPU support");
63 | #endif
64 | }
65 | AT_ERROR("Not implemented on the CPU");
66 | }
67 |
68 |
-------------------------------------------------------------------------------- /oneformer/modeling/pixel_decoder/ops/src/vision.cpp: --------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #include "ms_deform_attn.h"
17 |
18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
21 | }
22 |
-------------------------------------------------------------------------------- /oneformer/modeling/pixel_decoder/ops/test.py: --------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import time 17 | import torch 18 | import torch.nn as nn 19 | from torch.autograd import gradcheck 20 | 21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | N, M, D = 1, 2, 2 25 | Lq, L, P = 2, 2, 2 26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 28 | S = sum([(H*W).item() for H, W in shapes]) 29 | 30 | 31 | torch.manual_seed(3) 32 | 33 | 34 | @torch.no_grad() 35 | def check_forward_equal_with_pytorch_double(): 36 | value = torch.rand(N, S, M, D).cuda() * 0.01 37 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 38 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 39 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 40 | im2col_step = 2 41 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 42 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 43 | fwdok = torch.allclose(output_cuda, output_pytorch) 44 | max_abs_err = (output_cuda - output_pytorch).abs().max() 45 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 46 | 47 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 48 | 49 | 50 | @torch.no_grad() 51 | def check_forward_equal_with_pytorch_float(): 52 | value = torch.rand(N, S, M, D).cuda() * 0.01 53 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 54 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 55 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 56 | im2col_step = 2 57 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 58 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 59 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 60 | max_abs_err = (output_cuda - output_pytorch).abs().max() 61 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 62 | 63 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 64 | 65 | 66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 67 | 68 | value = torch.rand(N, S, M, channels).cuda() * 0.01 69 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 70 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 71 | attention_weights /= 
attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 72 | im2col_step = 2 73 | func = MSDeformAttnFunction.apply 74 | 75 | value.requires_grad = grad_value 76 | sampling_locations.requires_grad = grad_sampling_loc 77 | attention_weights.requires_grad = grad_attn_weight 78 | 79 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 80 | 81 | print(f'* {gradok} check_gradient_numerical(D={channels})') 82 | 83 | 84 | if __name__ == '__main__': 85 | check_forward_equal_with_pytorch_double() 86 | check_forward_equal_with_pytorch_float() 87 | 88 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 89 | check_gradient_numerical(channels, True, True, True) 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /oneformer/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .oneformer_transformer_decoder import ContrastiveMultiScaleMaskedTransformerDecoder -------------------------------------------------------------------------------- /oneformer/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/modeling/transformer_decoder/position_encoding.py 3 | # Modified by Jitesh Jain (https://github.com/praeclarumjj3) 4 | # ------------------------------------------------------------------------------ 5 | 6 | """ 7 | Various positional encodings for the transformer. 8 | """ 9 | import math 10 | 11 | import torch 12 | from torch import nn 13 | 14 | 15 | class PositionEmbeddingSine(nn.Module): 16 | """ 17 | This is a more standard version of the position embedding, very similar to the one 18 | used by the Attention is all you need paper, generalized to work on images. 
19 | """ 20 | 21 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 22 | super().__init__() 23 | self.num_pos_feats = num_pos_feats 24 | self.temperature = temperature 25 | self.normalize = normalize 26 | if scale is not None and normalize is False: 27 | raise ValueError("normalize should be True if scale is passed") 28 | if scale is None: 29 | scale = 2 * math.pi 30 | self.scale = scale 31 | 32 | def forward(self, x, mask=None): 33 | if mask is None: 34 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 35 | not_mask = ~mask 36 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 37 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 38 | if self.normalize: 39 | eps = 1e-6 40 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 41 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 42 | 43 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 44 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 45 | 46 | pos_x = x_embed[:, :, :, None] / dim_t 47 | pos_y = y_embed[:, :, :, None] / dim_t 48 | pos_x = torch.stack( 49 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 50 | ).flatten(3) 51 | pos_y = torch.stack( 52 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 53 | ).flatten(3) 54 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 55 | return pos 56 | 57 | def __repr__(self, _repr_indent=4): 58 | head = "Positional encoding " + self.__class__.__name__ 59 | body = [ 60 | "num_pos_feats: {}".format(self.num_pos_feats), 61 | "temperature: {}".format(self.temperature), 62 | "normalize: {}".format(self.normalize), 63 | "scale: {}".format(self.scale), 64 | ] 65 | # _repr_indent = 4 66 | lines = [head] + [" " * _repr_indent + line for line in body] 67 | return "\n".join(lines) 68 | -------------------------------------------------------------------------------- /oneformer/test_time_augmentation.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/test_time_augmentation.py 3 | # ------------------------------------------------------------------------------ 4 | 5 | import copy 6 | import logging 7 | from itertools import count 8 | 9 | import numpy as np 10 | import torch 11 | from fvcore.transforms import HFlipTransform 12 | from torch import nn 13 | from torch.nn.parallel import DistributedDataParallel 14 | 15 | from detectron2.data.detection_utils import read_image 16 | from .datasetmapper_tta import DatasetMapperTTA 17 | import torch.nn.functional as F 18 | 19 | __all__ = [ 20 | "SemanticSegmentorWithTTA", 21 | ] 22 | 23 | 24 | class SemanticSegmentorWithTTA(nn.Module): 25 | """ 26 | A SemanticSegmentor with test-time augmentation enabled. 27 | Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`. 28 | """ 29 | 30 | def __init__(self, cfg, model, tta_mapper=None, batch_size=1): 31 | """ 32 | Args: 33 | cfg (CfgNode): 34 | model (SemanticSegmentor): a SemanticSegmentor to apply TTA on. 35 | tta_mapper (callable): takes a dataset dict and returns a list of 36 | augmented versions of the dataset dict. Defaults to 37 | `DatasetMapperTTA(cfg)`. 38 | batch_size (int): batch the augmented images into this batch size for inference. 
39 | """ 40 | super().__init__() 41 | if isinstance(model, DistributedDataParallel): 42 | model = model.module 43 | self.cfg = cfg.clone() 44 | self.num_classes = self.cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES 45 | 46 | self.model = model 47 | 48 | if tta_mapper is None: 49 | tta_mapper = DatasetMapperTTA(cfg) 50 | self.tta_mapper = tta_mapper 51 | self.batch_size = batch_size 52 | 53 | def __call__(self, batched_inputs): 54 | """ 55 | Same input/output format as :meth:`SemanticSegmentor.forward` 56 | """ 57 | 58 | def _maybe_read_image(dataset_dict): 59 | ret = copy.copy(dataset_dict) 60 | if "image" not in ret: 61 | image = read_image(ret.pop("file_name"), self.model.input_format) 62 | image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW 63 | ret["image"] = image 64 | if "height" not in ret and "width" not in ret: 65 | ret["height"] = image.shape[1] 66 | ret["width"] = image.shape[2] 67 | return ret 68 | 69 | processed_results = [] 70 | for x in batched_inputs: 71 | result = self._inference_one_image(_maybe_read_image(x)) 72 | processed_results.append(result) 73 | return processed_results 74 | 75 | def _inference_one_image(self, input): 76 | """ 77 | Args: 78 | input (dict): one dataset dict with "image" field being a CHW tensor 79 | Returns: 80 | dict: one output dict 81 | """ 82 | orig_shape = (input["height"], input["width"]) 83 | augmented_inputs, tfms = self._get_augmented_inputs(input) 84 | 85 | final_predictions = None 86 | count_predictions = 0 87 | for input, tfm in zip(augmented_inputs, tfms): 88 | count_predictions += 1 89 | with torch.no_grad(): 90 | if final_predictions is None: 91 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 92 | final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2]) 93 | else: 94 | final_predictions = self.model([input])[0].pop("sem_seg") 95 | else: 96 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 97 | final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2]) 98 | else: 99 | final_predictions += self.model([input])[0].pop("sem_seg") 100 | 101 | final_predictions = final_predictions / count_predictions 102 | return {"sem_seg": final_predictions} 103 | 104 | def _get_augmented_inputs(self, input): 105 | augmented_inputs = self.tta_mapper(input) 106 | tfms = [x.pop("transforms") for x in augmented_inputs] 107 | return augmented_inputs, tfms -------------------------------------------------------------------------------- /oneformer/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .events import setup_wandb, WandbWriter -------------------------------------------------------------------------------- /oneformer/utils/box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Utilities for bounding box manipulation and GIoU. 
4 | """ 5 | import torch, os 6 | from torchvision.ops.boxes import box_area 7 | 8 | 9 | def box_cxcywh_to_xyxy(x): 10 | x_c, y_c, w, h = x.unbind(-1) 11 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 12 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 13 | return torch.stack(b, dim=-1) 14 | 15 | 16 | def box_xyxy_to_cxcywh(x): 17 | x0, y0, x1, y1 = x.unbind(-1) 18 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 19 | (x1 - x0), (y1 - y0)] 20 | return torch.stack(b, dim=-1) 21 | 22 | 23 | # modified from torchvision to also return the union 24 | def box_iou(boxes1, boxes2): 25 | area1 = box_area(boxes1) 26 | area2 = box_area(boxes2) 27 | 28 | # import ipdb; ipdb.set_trace() 29 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 30 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 31 | 32 | wh = (rb - lt).clamp(min=0) # [N,M,2] 33 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 34 | 35 | union = area1[:, None] + area2 - inter 36 | 37 | iou = inter / (union + 1e-6) 38 | return iou, union 39 | 40 | 41 | def generalized_box_iou(boxes1, boxes2): 42 | """ 43 | Generalized IoU from https://giou.stanford.edu/ 44 | The boxes should be in [x0, y0, x1, y1] format 45 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 46 | and M = len(boxes2) 47 | """ 48 | # degenerate boxes gives inf / nan results 49 | # so do an early check 50 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 51 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 52 | # except: 53 | # import ipdb; ipdb.set_trace() 54 | iou, union = box_iou(boxes1, boxes2) 55 | 56 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 57 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 58 | 59 | wh = (rb - lt).clamp(min=0) # [N,M,2] 60 | area = wh[:, :, 0] * wh[:, :, 1] 61 | 62 | return iou - (area - union) / (area + 1e-6) 63 | 64 | 65 | 66 | # modified from torchvision to also return the union 67 | def box_iou_pairwise(boxes1, boxes2): 68 | area1 = box_area(boxes1) 69 | area2 = box_area(boxes2) 70 | 71 | lt = torch.max(boxes1[:, :2], boxes2[:, :2]) # [N,2] 72 | rb = torch.min(boxes1[:, 2:], boxes2[:, 2:]) # [N,2] 73 | 74 | wh = (rb - lt).clamp(min=0) # [N,2] 75 | inter = wh[:, 0] * wh[:, 1] # [N] 76 | 77 | union = area1 + area2 - inter 78 | 79 | iou = inter / union 80 | return iou, union 81 | 82 | 83 | def generalized_box_iou_pairwise(boxes1, boxes2): 84 | """ 85 | Generalized IoU from https://giou.stanford.edu/ 86 | Input: 87 | - boxes1, boxes2: N,4 88 | Output: 89 | - giou: N, 4 90 | """ 91 | # degenerate boxes gives inf / nan results 92 | # so do an early check 93 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 94 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 95 | assert boxes1.shape == boxes2.shape 96 | iou, union = box_iou_pairwise(boxes1, boxes2) # N, 4 97 | 98 | lt = torch.min(boxes1[:, :2], boxes2[:, :2]) 99 | rb = torch.max(boxes1[:, 2:], boxes2[:, 2:]) 100 | 101 | wh = (rb - lt).clamp(min=0) # [N,2] 102 | area = wh[:, 0] * wh[:, 1] 103 | 104 | return iou - (area - union) / area 105 | 106 | def masks_to_boxes(masks): 107 | """Compute the bounding boxes around the provided masks 108 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 
109 | Returns a [N, 4] tensors, with the boxes in xyxy format 110 | """ 111 | if masks.numel() == 0: 112 | return torch.zeros((0, 4), device=masks.device) 113 | 114 | h, w = masks.shape[-2:] 115 | 116 | y = torch.arange(0, h, dtype=torch.float) 117 | x = torch.arange(0, w, dtype=torch.float) 118 | y, x = torch.meshgrid(y, x) 119 | 120 | x_mask = (masks * x.unsqueeze(0)) 121 | x_max = x_mask.flatten(1).max(-1)[0] 122 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 123 | 124 | y_mask = (masks * y.unsqueeze(0)) 125 | y_max = y_mask.flatten(1).max(-1)[0] 126 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 127 | 128 | return torch.stack([x_min, y_min, x_max, y_max], 1) 129 | 130 | if __name__ == '__main__': 131 | x = torch.rand(5, 4) 132 | y = torch.rand(3, 4) 133 | iou, union = box_iou(x, y) -------------------------------------------------------------------------------- /oneformer/utils/events.py: -------------------------------------------------------------------------------- 1 | import os 2 | import wandb 3 | from detectron2.utils import comm 4 | from detectron2.utils.events import EventWriter, get_event_storage 5 | 6 | 7 | def setup_wandb(cfg, args): 8 | if comm.is_main_process(): 9 | init_args = { 10 | k.lower(): v 11 | for k, v in cfg.WANDB.items() 12 | if isinstance(k, str) and k not in ["config"] 13 | } 14 | # only include most related part to avoid too big table 15 | # TODO: add configurable params to select which part of `cfg` should be saved in config 16 | if "config_exclude_keys" in init_args: 17 | init_args["config"] = cfg 18 | init_args["config"]["cfg_file"] = args.config_file 19 | else: 20 | init_args["config"] = { 21 | "model": cfg.MODEL, 22 | "solver": cfg.SOLVER, 23 | "cfg_file": args.config_file, 24 | } 25 | if ("name" not in init_args) or (init_args["name"] is None): 26 | init_args["name"] = os.path.basename(args.config_file) 27 | else: 28 | init_args["name"] = init_args["name"] + '_' + os.path.basename(args.config_file) 29 | wandb.init(**init_args) 30 | 31 | 32 | class BaseRule(object): 33 | def __call__(self, target): 34 | return target 35 | 36 | 37 | class IsIn(BaseRule): 38 | def __init__(self, keyword: str): 39 | self.keyword = keyword 40 | 41 | def __call__(self, target): 42 | return self.keyword in target 43 | 44 | 45 | class Prefix(BaseRule): 46 | def __init__(self, keyword: str): 47 | self.keyword = keyword 48 | 49 | def __call__(self, target): 50 | return "/".join([self.keyword, target]) 51 | 52 | 53 | class WandbWriter(EventWriter): 54 | """ 55 | Write all scalars to a tensorboard file. 56 | """ 57 | 58 | def __init__(self): 59 | """ 60 | Args: 61 | log_dir (str): the directory to save the output events 62 | kwargs: other arguments passed to `torch.utils.tensorboard.SummaryWriter(...)` 63 | """ 64 | self._last_write = -1 65 | self._group_rules = [ 66 | (IsIn("/"), BaseRule()), 67 | (IsIn("loss"), Prefix("train")), 68 | ] 69 | 70 | def write(self): 71 | 72 | storage = get_event_storage() 73 | 74 | def _group_name(scalar_name): 75 | for (rule, op) in self._group_rules: 76 | if rule(scalar_name): 77 | return op(scalar_name) 78 | return scalar_name 79 | 80 | stats = { 81 | _group_name(name): scalars[0] 82 | for name, scalars in storage.latest().items() 83 | if scalars[1] > self._last_write 84 | } 85 | if len(stats) > 0: 86 | self._last_write = max([v[1] for k, v in storage.latest().items()]) 87 | 88 | # storage.put_{image,histogram} is only meant to be used by 89 | # tensorboard writer. 
So we access its internal fields directly from here. 90 | if len(storage._vis_data) >= 1: 91 | stats["image"] = [ 92 | wandb.Image(img, caption=img_name) 93 | for img_name, img, step_num in storage._vis_data 94 | ] 95 | # Storage stores all image data and rely on this writer to clear them. 96 | # As a result it assumes only one writer will use its image data. 97 | # An alternative design is to let storage store limited recent 98 | # data (e.g. only the most recent image) that all writers can access. 99 | # In that case a writer may not see all image data if its period is long. 100 | storage.clear_images() 101 | 102 | if len(storage._histograms) >= 1: 103 | 104 | def create_bar(tag, bucket_limits, bucket_counts, **kwargs): 105 | data = [ 106 | [label, val] for (label, val) in zip(bucket_limits, bucket_counts) 107 | ] 108 | table = wandb.Table(data=data, columns=["label", "value"]) 109 | return wandb.plot.bar(table, "label", "value", title=tag) 110 | 111 | stats["hist"] = [create_bar(**params) for params in storage._histograms] 112 | 113 | storage.clear_histograms() 114 | 115 | if len(stats) == 0: 116 | return 117 | wandb.log(stats, step=storage.iter) 118 | 119 | def close(self): 120 | wandb.finish() -------------------------------------------------------------------------------- /oneformer/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py 3 | """ 4 | Misc functions, including distributed helpers. 5 | 6 | Mostly copy-paste from torchvision references. 7 | """ 8 | from typing import List, Optional 9 | 10 | import torch 11 | import torch.distributed as dist 12 | import torchvision 13 | from torch import Tensor 14 | import warnings 15 | import torch.nn.functional as F 16 | import math 17 | 18 | def inverse_sigmoid(x, eps=1e-3): 19 | x = x.clamp(min=0, max=1) 20 | x1 = x.clamp(min=eps) 21 | x2 = (1 - x).clamp(min=eps) 22 | return torch.log(x1/x2) 23 | 24 | def _no_grad_trunc_normal_(tensor, mean, std, a, b): 25 | # Cut & paste from PyTorch official master until it's in a few official releases - RW 26 | # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf 27 | def norm_cdf(x): 28 | # Computes standard normal cumulative distribution function 29 | return (1. + math.erf(x / math.sqrt(2.))) / 2. 30 | 31 | if (mean < a - 2 * std) or (mean > b + 2 * std): 32 | warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " 33 | "The distribution of values may be incorrect.", 34 | stacklevel=2) 35 | 36 | with torch.no_grad(): 37 | # Values are generated by using a truncated uniform distribution and 38 | # then using the inverse CDF for the normal distribution. 39 | # Get upper and lower cdf values 40 | l = norm_cdf((a - mean) / std) 41 | u = norm_cdf((b - mean) / std) 42 | 43 | # Uniformly fill tensor with values from [l, u], then translate to 44 | # [2l-1, 2u-1]. 
45 | tensor.uniform_(2 * l - 1, 2 * u - 1) 46 | 47 | # Use inverse cdf transform for normal distribution to get truncated 48 | # standard normal 49 | tensor.erfinv_() 50 | 51 | # Transform to proper mean, std 52 | tensor.mul_(std * math.sqrt(2.)) 53 | tensor.add_(mean) 54 | 55 | # Clamp to ensure it's in the proper range 56 | tensor.clamp_(min=a, max=b) 57 | return tensor 58 | 59 | def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): 60 | # type: (Tensor, float, float, float, float) -> Tensor 61 | r"""Fills the input Tensor with values drawn from a truncated 62 | normal distribution. The values are effectively drawn from the 63 | normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` 64 | with values outside :math:`[a, b]` redrawn until they are within 65 | the bounds. The method used for generating the random values works 66 | best when :math:`a \leq \text{mean} \leq b`. 67 | Args: 68 | tensor: an n-dimensional `torch.Tensor` 69 | mean: the mean of the normal distribution 70 | std: the standard deviation of the normal distribution 71 | a: the minimum cutoff value 72 | b: the maximum cutoff value 73 | Examples: 74 | >>> w = torch.empty(3, 5) 75 | >>> nn.init.trunc_normal_(w) 76 | """ 77 | return _no_grad_trunc_normal_(tensor, mean, std, a, b) 78 | 79 | def resize(input, 80 | size=None, 81 | scale_factor=None, 82 | mode='nearest', 83 | align_corners=None, 84 | warning=True): 85 | if warning: 86 | if size is not None and align_corners: 87 | input_h, input_w = tuple(int(x) for x in input.shape[2:]) 88 | output_h, output_w = tuple(int(x) for x in size) 89 | if output_h > input_h or output_w > output_h: 90 | if ((output_h > 1 and output_w > 1 and input_h > 1 91 | and input_w > 1) and (output_h - 1) % (input_h - 1) 92 | and (output_w - 1) % (input_w - 1)): 93 | warnings.warn( 94 | f'When align_corners={align_corners}, ' 95 | 'the output would more aligned if ' 96 | f'input size {(input_h, input_w)} is `x+1` and ' 97 | f'out size {(output_h, output_w)} is `nx+1`') 98 | if isinstance(size, torch.Size): 99 | size = tuple(int(x) for x in size) 100 | return F.interpolate(input, size, scale_factor, mode, align_corners) 101 | 102 | def _max_by_axis(the_list): 103 | # type: (List[List[int]]) -> List[int] 104 | maxes = the_list[0] 105 | for sublist in the_list[1:]: 106 | for index, item in enumerate(sublist): 107 | maxes[index] = max(maxes[index], item) 108 | return maxes 109 | 110 | 111 | class NestedTensor(object): 112 | def __init__(self, tensors, mask: Optional[Tensor]): 113 | self.tensors = tensors 114 | self.mask = mask 115 | 116 | def to(self, device): 117 | # type: (Device) -> NestedTensor # noqa 118 | cast_tensor = self.tensors.to(device) 119 | mask = self.mask 120 | if mask is not None: 121 | assert mask is not None 122 | cast_mask = mask.to(device) 123 | else: 124 | cast_mask = None 125 | return NestedTensor(cast_tensor, cast_mask) 126 | 127 | def decompose(self): 128 | return self.tensors, self.mask 129 | 130 | def __repr__(self): 131 | return str(self.tensors) 132 | 133 | 134 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 135 | # TODO make this more general 136 | if tensor_list[0].ndim == 3: 137 | if torchvision._is_tracing(): 138 | # nested_tensor_from_tensor_list() does not export well to ONNX 139 | # call _onnx_nested_tensor_from_tensor_list() instead 140 | return _onnx_nested_tensor_from_tensor_list(tensor_list) 141 | 142 | # TODO make it support different-sized images 143 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 144 | # 
min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 145 | batch_shape = [len(tensor_list)] + max_size 146 | b, c, h, w = batch_shape 147 | dtype = tensor_list[0].dtype 148 | device = tensor_list[0].device 149 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 150 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 151 | for img, pad_img, m in zip(tensor_list, tensor, mask): 152 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 153 | m[: img.shape[1], : img.shape[2]] = False 154 | else: 155 | raise ValueError("not supported") 156 | return NestedTensor(tensor, mask) 157 | 158 | 159 | # _onnx_nested_tensor_from_tensor_list() is an implementation of 160 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing. 161 | @torch.jit.unused 162 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: 163 | max_size = [] 164 | for i in range(tensor_list[0].dim()): 165 | max_size_i = torch.max( 166 | torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) 167 | ).to(torch.int64) 168 | max_size.append(max_size_i) 169 | max_size = tuple(max_size) 170 | 171 | # work around for 172 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 173 | # m[: img.shape[1], :img.shape[2]] = False 174 | # which is not yet supported in onnx 175 | padded_imgs = [] 176 | padded_masks = [] 177 | for img in tensor_list: 178 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 179 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 180 | padded_imgs.append(padded_img) 181 | 182 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) 183 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) 184 | padded_masks.append(padded_mask.to(torch.bool)) 185 | 186 | tensor = torch.stack(padded_imgs) 187 | mask = torch.stack(padded_masks) 188 | 189 | return NestedTensor(tensor, mask=mask) 190 | 191 | 192 | def is_dist_avail_and_initialized(): 193 | if not dist.is_available(): 194 | return False 195 | if not dist.is_initialized(): 196 | return False 197 | return True 198 | -------------------------------------------------------------------------------- /oneformer/utils/pos_embed.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Position embedding utils 3 | # -------------------------------------------------------- 4 | 5 | from typing import Tuple 6 | 7 | import numpy as np 8 | import torch 9 | 10 | 11 | # -------------------------------------------------------- 12 | # 2D sine-cosine position embedding 13 | # References: 14 | # Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py 15 | # MoCo v3: https://github.com/facebookresearch/moco-v3 16 | # -------------------------------------------------------- 17 | def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): 18 | """ 19 | grid_size: int of the grid height and width 20 | return: 21 | pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) 22 | """ 23 | grid_h = np.arange(grid_size, dtype=np.float32) 24 | grid_w = np.arange(grid_size, dtype=np.float32) 25 | grid = np.meshgrid(grid_w, grid_h) # here w goes first 26 | grid = np.stack(grid, axis=0) 27 | 28 | grid = grid.reshape([2, 1, grid_size, grid_size]) 29 | pos_embed = 
get_2d_sincos_pos_embed_from_grid(embed_dim, grid) 30 | if cls_token: 31 | pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) 32 | return pos_embed 33 | 34 | 35 | def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): 36 | assert embed_dim % 2 == 0 37 | 38 | # use half of dimensions to encode grid_h 39 | emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) 40 | emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) 41 | 42 | emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) 43 | return emb 44 | 45 | 46 | def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): 47 | """ 48 | embed_dim: output dimension for each position 49 | pos: a list of positions to be encoded: size (M,) 50 | out: (M, D) 51 | """ 52 | assert embed_dim % 2 == 0 53 | omega = np.arange(embed_dim // 2, dtype=np.float) 54 | omega /= embed_dim / 2.0 55 | omega = 1.0 / 10000 ** omega # (D/2,) 56 | 57 | pos = pos.reshape(-1) # (M,) 58 | out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product 59 | 60 | emb_sin = np.sin(out) # (M, D/2) 61 | emb_cos = np.cos(out) # (M, D/2) 62 | 63 | emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) 64 | return emb 65 | 66 | 67 | # -------------------------------------------------------- 68 | # Interpolate position embeddings for high-resolution 69 | # References: 70 | # DeiT: https://github.com/facebookresearch/deit 71 | # -------------------------------------------------------- 72 | def interpolate_pos_embed(model, checkpoint_model, pos_embed_key): 73 | if pos_embed_key in checkpoint_model: 74 | pos_embed_checkpoint = checkpoint_model[pos_embed_key] 75 | embedding_size = pos_embed_checkpoint.shape[-1] 76 | num_patches = model.num_patches 77 | if pos_embed_key.startswith("decoder"): 78 | num_extra_tokens = model.decoder_pos_embed.shape[-2] - num_patches 79 | else: 80 | num_extra_tokens = model.pos_embed.shape[-2] - num_patches 81 | # height (== width) for the checkpoint position embedding 82 | orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) 83 | # height (== width) for the new position embedding 84 | new_size = int(num_patches ** 0.5) 85 | # class_token and dist_token are kept unchanged 86 | if orig_size != new_size: 87 | print( 88 | "Position interpolate from %dx%d to %dx%d" 89 | % (orig_size, orig_size, new_size, new_size) 90 | ) 91 | extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] 92 | # only the position tokens are interpolated 93 | pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] 94 | pos_tokens = pos_tokens.reshape( 95 | -1, orig_size, orig_size, embedding_size 96 | ).permute(0, 3, 1, 2) 97 | pos_tokens = torch.nn.functional.interpolate( 98 | pos_tokens, 99 | size=(new_size, new_size), 100 | mode="bicubic", 101 | align_corners=False, 102 | ) 103 | pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) 104 | new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) 105 | checkpoint_model[pos_embed_key] = new_pos_embed 106 | 107 | 108 | def interpolate_pos_embed_online( 109 | pos_embed, orig_size: Tuple[int], new_size: Tuple[int], num_extra_tokens: int 110 | ): 111 | extra_tokens = pos_embed[:, :num_extra_tokens] 112 | pos_tokens = pos_embed[:, num_extra_tokens:] 113 | embedding_size = pos_tokens.shape[-1] 114 | pos_tokens = pos_tokens.reshape( 115 | -1, orig_size[0], orig_size[1], embedding_size 116 | ).permute(0, 3, 1, 2) 117 | pos_tokens = torch.nn.functional.interpolate( 118 | pos_tokens, size=new_size, mode="bicubic", align_corners=False, 119 
| ) 120 | pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) 121 | new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) 122 | return new_pos_embed 123 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | scipy==1.8.1 3 | shapely 4 | h5py==3.7.0 5 | submitit==1.4.2 6 | scikit-image 7 | timm==0.4.12 8 | einops==0.4.1 9 | icecream==2.1.2 10 | setuptools==59.5.0 11 | wandb==0.12.20 12 | ftfy==6.1.1 13 | regex==2022.6.2 14 | inflect==5.6.0 15 | diffdist==0.1 16 | pytorch_lightning==1.6.4 17 | tqdm==4.64.0 18 | mmcv==1.6.2 19 | -f https://shi-labs.com/natten/wheels/cu113/torch1.10.1/index.html 20 | natten==0.14.4 -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | # OneFormer Tools 2 | 3 | ## Download Pretrained Weights 4 | 5 | It's common to initialize from backbone models pre-trained on ImageNet classification tasks. We use [Swin-Transformer](https://github.com/microsoft/Swin-Transformer), [ConvNeXt](https://github.com/facebookresearch/ConvNeXt), and [DiNAT](https://github.com/SHI-Labs/Neighborhood-Attention-Transformer) for our experiments. 6 | 7 |
8 | ### Swin-Transformer 9 | 10 | - [Official Repo](https://github.com/microsoft/Swin-Transformer) 11 | - `convert-pretrained-model-to-d2.py`: Tool to convert Swin Transformer pre-trained weights for D2 (a condensed sketch of the conversion follows this block). 12 | 13 | ```bash 14 | pip install timm 15 | 16 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth 17 | python tools/convert-pretrained-model-to-d2.py swin_large_patch4_window12_384_22k.pth swin_large_patch4_window12_384_22k.pkl 18 | 19 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22kto1k.pth 20 | python tools/convert-pretrained-model-to-d2.py swin_large_patch4_window12_384_22kto1k.pth swin_large_patch4_window12_384_22kto1k.pkl 21 | ``` 22 | 23 |
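For reference, the Swin conversion above boils down to re-serializing the checkpoint's `model` state dict into a Detectron2-style pickle. A condensed sketch of what `tools/convert-pretrained-model-to-d2.py` (reproduced in full later in this listing) does, using a filename from the commands above:

```python
import pickle as pkl

import torch

# same logic as tools/convert-pretrained-model-to-d2.py, inlined for illustration
src = "swin_large_patch4_window12_384_22k.pth"
dst = "swin_large_patch4_window12_384_22k.pkl"

# Swin checkpoints store the weights under the "model" key
state_dict = torch.load(src, map_location="cpu")["model"]

# "matching_heuristics" lets Detectron2 remap backbone keys when the weights are loaded
res = {"model": state_dict, "__author__": "third_party", "matching_heuristics": True}
with open(dst, "wb") as f:
    pkl.dump(res, f)
```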
24 | 25 |
26 | ### ConvNeXt 27 | 28 | - [Official Repo](https://github.com/facebookresearch/ConvNeXt) 29 | - `convert-pretrained-model-to-d2.py`: Tool to convert ConvNeXt pre-trained weights for D2. 30 | 31 | ```bash 32 | wget https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_384.pth 33 | python tools/convert-pretrained-model-to-d2.py convnext_large_22k_1k_384.pth convnext_large_22k_1k_384.pkl 34 | 35 | wget https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_384_ema.pth 36 | python tools/convert-pretrained-model-to-d2.py convnext_xlarge_22k_1k_384_ema.pth convnext_xlarge_22k_1k_384_ema.pkl 37 | ``` 38 | 39 |
40 | 41 |
42 | ### DiNAT 43 | 44 | - [Official Repo](https://github.com/SHI-Labs/Neighborhood-Attention-Transformer) 45 | - `convert-pretrained-nat-model-to-d2.py`: Tool to convert DiNAT pre-trained weights for D2. 46 | 47 | ```bash 48 | wget https://shi-labs.com/projects/dinat/checkpoints/imagenet1k/dinat_large_in22k_in1k_384_11x11.pth 49 | python tools/convert-pretrained-nat-model-to-d2.py dinat_large_in22k_in1k_384_11x11.pth dinat_large_in22k_in1k_384_11x11.pkl 50 | 51 | wget https://shi-labs.com/projects/dinat/checkpoints/imagenet22k/dinat_large_in22k_224.pth 52 | python tools/convert-pretrained-nat-model-to-d2.py dinat_large_in22k_224.pth dinat_large_in22k_224.pkl 53 | ``` 54 | 55 |
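Each converter in this section writes a Detectron2-style pickle with `model`, `__author__`, and `matching_heuristics` keys. Before pointing `MODEL.WEIGHTS` at a converted file, a minimal sanity check can be run; the path below is only an example:

```python
import pickle

# hypothetical path: any .pkl produced by the conversion commands above works here
ckpt_path = "dinat_large_in22k_in1k_384_11x11.pkl"

with open(ckpt_path, "rb") as f:
    ckpt = pickle.load(f)

# the converters always set matching_heuristics so Detectron2 remaps backbone keys on load
assert ckpt["matching_heuristics"] is True
print(f'{len(ckpt["model"])} entries, converted by "{ckpt["__author__"]}"')
```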
56 | 57 | ## Analyze Model 58 | 59 | - Tool to analyze model parameters, FLOPs, and speed. 60 | - We use a dummy image to compute FLOPs on ADE20K and Cityscapes. 61 | - For COCO, we use 100 random validation images. 62 | - We set `task = panoptic` by default. 63 | 64 | ```bash 65 | python tools/analyze_model.py --num-inputs 100 --tasks [flop speed] \ 66 | --config-file configs/ade20k/swin/oneformer_swin_large_IN21k_384_bs16_160k.yaml \ 67 | MODEL.WEIGHTS <path-to-checkpoint> [--use-fixed-input-size] MODEL.TEST.SEMANTIC_ON False MODEL.TEST.INSTANCE_ON False 68 | ``` 69 | 70 | ## Training Throughput 71 | 72 | - Tool to compute throughput. 73 | - We compute throughput for 500 iterations by default. 74 | 75 | ```bash 76 | python tools/calc_throughput.py --dist-url 'tcp://127.0.0.1:50162' \ 77 | --num-gpus 8 \ 78 | --config-file configs/ade20k/swin/oneformer_swin_large_IN21k_384_bs16_160k.yaml \ 79 | MODEL.WEIGHTS pretrain/swin_large_patch4_window12_384_22kto1k.pkl \ 80 | OUTPUT_DIR tp_out SOLVER.MAX_ITER 500 81 | 82 | rm -rf tp_out 83 | ``` 84 | -------------------------------------------------------------------------------- /tools/analyze_model.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy as np 3 | from collections import Counter 4 | import tqdm 5 | from fvcore.nn import flop_count_table # can also try flop_count_str 6 | 7 | from detectron2.checkpoint import DetectionCheckpointer 8 | from detectron2.config import CfgNode, LazyConfig, get_cfg, instantiate 9 | from detectron2.engine import default_argument_parser 10 | from detectron2.modeling import build_model 11 | from detectron2.projects.deeplab import add_deeplab_config 12 | from detectron2.utils.analysis import ( 13 | FlopCountAnalysis, 14 | activation_count_operators, 15 | parameter_count_table, 16 | ) 17 | from detectron2.utils.logger import setup_logger 18 | 19 | # fmt: off 20 | import os 21 | import sys 22 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 23 | # fmt: on 24 | 25 | from oneformer.data.build import * 26 | from oneformer.data.dataset_mappers.dataset_mapper import DatasetMapper 27 | from oneformer import ( 28 | add_oneformer_config, 29 | add_common_config, 30 | add_swin_config, 31 | add_dinat_config, 32 | add_beit_adapter_config, 33 | add_convnext_config, 34 | ) 35 | 36 | logger = logging.getLogger("detectron2") 37 | 38 | 39 | def setup(args): 40 | if args.config_file.endswith(".yaml"): 41 | cfg = get_cfg() 42 | add_deeplab_config(cfg) 43 | add_common_config(cfg) 44 | add_swin_config(cfg) 45 | add_dinat_config(cfg) 46 | add_beit_adapter_config(cfg) 47 | add_oneformer_config(cfg) 48 | add_convnext_config(cfg) 49 | cfg.merge_from_file(args.config_file) 50 | cfg.DATALOADER.NUM_WORKERS = 0 51 | cfg.merge_from_list(args.opts) 52 | cfg.freeze() 53 | else: 54 | cfg = LazyConfig.load(args.config_file) 55 | cfg = LazyConfig.apply_overrides(cfg, args.opts) 56 | setup_logger(name="fvcore") 57 | setup_logger() 58 | return cfg 59 | 60 | 61 | def do_flop(cfg): 62 | if isinstance(cfg, CfgNode): 63 | mapper = DatasetMapper(cfg, False) 64 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST_PANOPTIC[0], mapper=mapper) 65 | model = build_model(cfg) 66 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 67 | else: 68 | data_loader = instantiate(cfg.dataloader.test) 69 | model = instantiate(cfg.model) 70 | model.to(cfg.train.device) 71 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint) 72 | model.eval() 73 | 74 | counts = Counter() 75 | total_flops = [] 76 |
for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa 77 | if args.use_fixed_input_size and isinstance(cfg, CfgNode): 78 | import torch 79 | crop_size = cfg.INPUT.CROP.SIZE 80 | data[0]["image"] = torch.zeros((3, crop_size[0], crop_size[1])) 81 | flops = FlopCountAnalysis(model, data) 82 | if idx > 0: 83 | flops.unsupported_ops_warnings(False).uncalled_modules_warnings(False) 84 | counts += flops.by_operator() 85 | total_flops.append(flops.total()) 86 | 87 | logger.info("Flops table computed from only one input sample:\n" + flop_count_table(flops)) 88 | logger.info( 89 | "Average GFlops for each type of operators:\n" 90 | + str([(k, v / (idx + 1) / 1e9) for k, v in counts.items()]) 91 | ) 92 | logger.info( 93 | "Total GFlops: {:.1f}±{:.1f}".format(np.mean(total_flops) / 1e9, np.std(total_flops) / 1e9) 94 | ) 95 | 96 | 97 | def do_activation(cfg): 98 | if isinstance(cfg, CfgNode): 99 | mapper = DatasetMapper(cfg, False) 100 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST_PANOPTIC[0], mapper=mapper) 101 | model = build_model(cfg) 102 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 103 | else: 104 | data_loader = instantiate(cfg.dataloader.test) 105 | model = instantiate(cfg.model) 106 | model.to(cfg.train.device) 107 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint) 108 | model.eval() 109 | 110 | counts = Counter() 111 | total_activations = [] 112 | for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa 113 | count = activation_count_operators(model, data) 114 | counts += count 115 | total_activations.append(sum(count.values())) 116 | logger.info( 117 | "(Million) Activations for Each Type of Operators:\n" 118 | + str([(k, v / idx) for k, v in counts.items()]) 119 | ) 120 | logger.info( 121 | "Total (Million) Activations: {}±{}".format( 122 | np.mean(total_activations), np.std(total_activations) 123 | ) 124 | ) 125 | 126 | def do_speed(cfg): 127 | if isinstance(cfg, CfgNode): 128 | model = build_model(cfg) 129 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 130 | else: 131 | model = instantiate(cfg.model) 132 | model.to(cfg.train.device) 133 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint) 134 | model.eval() 135 | import torch 136 | crop_size = cfg.INPUT.CROP.SIZE 137 | data = [{}] 138 | data[0]["image"] = torch.zeros((3, crop_size[0], crop_size[1])) 139 | data[0]["task"] = "the task is panoptic" 140 | total_times = [] 141 | for _ in tqdm.trange(100): # noqa 142 | model(data) 143 | torch.cuda.synchronize() 144 | tstart = torch.cuda.Event(enable_timing=True) 145 | tend = torch.cuda.Event(enable_timing=True) 146 | fps = [] 147 | times = [] 148 | for _ in range(5): 149 | for _ in tqdm.trange(args.num_inputs): # noqa 150 | tstart.record() 151 | model(data) 152 | tend.record() 153 | torch.cuda.synchronize() 154 | total_times.append(tstart.elapsed_time(tend)) 155 | times.append(np.mean(total_times)) 156 | fps.append(1000/np.mean(total_times)) 157 | 158 | logger.info( 159 | "Average Time per {}x{} Image : {:.1f} ± {:.1f} milli-seconds".format(crop_size, crop_size, np.mean(times), np.std(times)) 160 | ) 161 | logger.info( 162 | "FPS : {:.2f} ± {:.2f}".format(np.mean(fps), np.std(fps)) 163 | ) 164 | 165 | def do_parameter(cfg): 166 | if isinstance(cfg, CfgNode): 167 | model = build_model(cfg) 168 | else: 169 | model = instantiate(cfg.model) 170 | logger.info("Parameter Count:\n" + parameter_count_table(model, max_depth=5)) 171 | 172 | 173 | def do_structure(cfg): 174 | if isinstance(cfg, CfgNode): 175 | 
model = build_model(cfg) 176 | else: 177 | model = instantiate(cfg.model) 178 | logger.info("Model Structure:\n" + str(model)) 179 | 180 | 181 | if __name__ == "__main__": 182 | parser = default_argument_parser( 183 | epilog=""" 184 | Examples: 185 | To show parameters of a model: 186 | $ ./analyze_model.py --tasks parameter \\ 187 | --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml 188 | Flops and activations are data-dependent, therefore inputs and model weights 189 | are needed to count them: 190 | $ ./analyze_model.py --num-inputs 100 --tasks flop \\ 191 | --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \\ 192 | MODEL.WEIGHTS /path/to/model.pkl 193 | """ 194 | ) 195 | parser.add_argument( 196 | "--tasks", 197 | choices=["flop", "speed", "activation", "parameter", "structure"], 198 | required=True, 199 | nargs="+", 200 | ) 201 | parser.add_argument( 202 | "-n", 203 | "--num-inputs", 204 | default=100, 205 | type=int, 206 | help="number of inputs used to compute statistics for flops/activations, " 207 | "both are data dependent.", 208 | ) 209 | parser.add_argument( 210 | "--use-fixed-input-size", 211 | action="store_true", 212 | help="use fixed input size when calculating flops", 213 | ) 214 | args = parser.parse_args() 215 | assert not args.eval_only 216 | assert args.num_gpus == 1 217 | 218 | cfg = setup(args) 219 | 220 | for task in args.tasks: 221 | { 222 | "flop": do_flop, 223 | "speed": do_speed, 224 | "activation": do_activation, 225 | "parameter": do_parameter, 226 | "structure": do_structure, 227 | }[task](cfg) 228 | -------------------------------------------------------------------------------- /tools/convert-pretrained-model-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download pretrained swin model: 12 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 13 | # run the conversion 14 | ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 15 | # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl" 18 | INPUT: 19 | FORMAT: "RGB" 20 | """ 21 | 22 | if __name__ == "__main__": 23 | input = sys.argv[1] 24 | 25 | obj = torch.load(input, map_location="cpu")["model"] 26 | 27 | res = {"model": obj, "__author__": "third_party", "matching_heuristics": True} 28 | 29 | with open(sys.argv[2], "wb") as f: 30 | pkl.dump(res, f) 31 | -------------------------------------------------------------------------------- /tools/convert-pretrained-nat-model-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download a pretrained DiNAT model: 12 | wget https://shi-labs.com/projects/dinat/checkpoints/imagenet22k/dinat_large_in22k_224.pth 13 | # run the conversion 14 | ./convert-pretrained-nat-model-to-d2.py dinat_large_in22k_224.pth dinat_large_in22k_224.pkl 15 | # Then, use dinat_large_in22k_224.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/dinat_large_in22k_224.pkl" 18 | INPUT: 19 | FORMAT: "RGB" 20 | """ 21 | 22 | if __name__ == "__main__": 23 | input = sys.argv[1] 24 | 25 | obj = torch.load(input, map_location="cpu") 26 | 27 | res = {"model": obj, "__author__": "third_party", "matching_heuristics": True} 28 | 29 | with open(sys.argv[2], "wb") as f: 30 | pkl.dump(res, f) 31 | -------------------------------------------------------------------------------- /tools/convert-torchvision-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download one of the ResNet{18,34,50,101,152} models from torchvision: 12 | wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth 13 | # run the conversion 14 | ./convert-torchvision-to-d2.py r50.pth r50.pkl 15 | # Then, use r50.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/r50.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | RESNETS: 21 | DEPTH: 50 22 | STRIDE_IN_1X1: False 23 | INPUT: 24 | FORMAT: "RGB" 25 | """ 26 | 27 | if __name__ == "__main__": 28 | input = sys.argv[1] 29 | 30 | obj = torch.load(input, map_location="cpu") 31 | 32 | newmodel = {} 33 | for k in list(obj.keys()): 34 | old_k = k 35 | if "layer" not in k: 36 | k = "stem." + k 37 | for t in [1, 2, 3, 4]: 38 | k = k.replace("layer{}".format(t), "res{}".format(t + 1)) 39 | for t in [1, 2, 3]: 40 | k = k.replace("bn{}".format(t), "conv{}.norm".format(t)) 41 | k = k.replace("downsample.0", "shortcut") 42 | k = k.replace("downsample.1", "shortcut.norm") 43 | print(old_k, "->", k) 44 | newmodel[k] = obj.pop(old_k).detach().numpy() 45 | 46 | res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True} 47 | 48 | with open(sys.argv[2], "wb") as f: 49 | pkl.dump(res, f) 50 | if obj: 51 | print("Unconverted keys:", obj.keys()) 52 | -------------------------------------------------------------------------------- /tools/setup_detectron2.py: -------------------------------------------------------------------------------- 1 | import sys, os, distutils.core, subprocess 2 | 3 | if not os.path.exists('./detectron2'): 4 | subprocess.run(['git', 'clone', 'https://github.com/facebookresearch/detectron2']) 5 | 6 | dist = distutils.core.run_setup("./detectron2/setup.py") 7 | 8 | for x in dist.install_requires: 9 | subprocess.run(['python', '-m', 'pip', 'install', x]) 10 | 11 | sys.path.insert(0, os.path.abspath('./detectron2')) --------------------------------------------------------------------------------
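To make the renaming performed by `tools/convert-torchvision-to-d2.py` above concrete, here is a small self-contained sketch that applies the same key mapping to a few representative torchvision ResNet keys (the sample keys are illustrative, not exhaustive):

```python
# reproduces the key-renaming loop from tools/convert-torchvision-to-d2.py
def remap(k: str) -> str:
    if "layer" not in k:
        k = "stem." + k
    for t in [1, 2, 3, 4]:
        k = k.replace("layer{}".format(t), "res{}".format(t + 1))
    for t in [1, 2, 3]:
        k = k.replace("bn{}".format(t), "conv{}.norm".format(t))
    k = k.replace("downsample.0", "shortcut")
    k = k.replace("downsample.1", "shortcut.norm")
    return k


for key in ["conv1.weight", "bn1.weight", "layer1.0.bn1.weight", "layer4.2.downsample.0.weight"]:
    print(key, "->", remap(key))
# conv1.weight -> stem.conv1.weight
# bn1.weight -> stem.conv1.norm.weight
# layer1.0.bn1.weight -> res2.0.conv1.norm.weight
# layer4.2.downsample.0.weight -> res5.2.shortcut.weight
```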