├── hgformer ├── evaluation │ ├── __init__.py │ └── instance_evaluation.py ├── utils │ ├── __init__.py │ └── misc.py ├── modeling │ ├── backbone │ │ └── __init__.py │ ├── meta_arch │ │ ├── __init__.py │ │ ├── group_former_head.py │ │ └── mask_former_head.py │ ├── pixel_decoder │ │ ├── __init__.py │ │ └── ops │ │ │ ├── make.sh │ │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn.py │ │ │ ├── functions │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn_func.py │ │ │ ├── src │ │ │ ├── vision.cpp │ │ │ ├── cuda │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ └── ms_deform_attn_cuda.cu │ │ │ ├── cpu │ │ │ │ ├── ms_deform_attn_cpu.h │ │ │ │ └── ms_deform_attn_cpu.cpp │ │ │ └── ms_deform_attn.h │ │ │ ├── setup.py │ │ │ └── test.py │ ├── transformer_decoder │ │ ├── __init__.py │ │ ├── position_encoding.py │ │ └── maskformer_transformer_decoder.py │ └── __init__.py ├── data │ ├── dataset_mappers │ │ ├── __init__.py │ │ ├── mask_former_panoptic_dataset_mapper.py │ │ ├── mask_former_instance_dataset_mapper.py │ │ └── mask_former_semantic_dataset_mapper.py │ ├── __init__.py │ ├── samplers │ │ ├── __init__.py │ │ ├── grouped_batch_sampler.py │ │ └── balanced_sampler.py │ └── datasets │ │ ├── __init__.py │ │ ├── register_gta.py │ │ ├── register_synthia.py │ │ ├── register_bdd.py │ │ ├── register_mapillary_19.py │ │ ├── register_city_c.py │ │ └── register_city_c_vis.py ├── __init__.py ├── test_time_augmentation.py └── config.py ├── requirements.txt ├── configs ├── cityscapes │ ├── maskformer2_swin_large_IN21k_384_bs16_20k.yaml │ ├── hgformer_swin_large_IN21K_384_bs16_20k.yaml │ ├── maskformer2_swin_tiny_bs16_20k.yaml │ ├── hgformer_swin_tiny_bs16_20k.yaml │ ├── maskformer2_R50_bs16_20k_gn.yaml │ ├── Base-Cityscapes-SemanticSegmentation.yaml │ └── hgformer_R50_bs16_20k.yaml ├── mapillary │ ├── maskformer2_swin_large_IN21k_384_bs16_20k_mapillary.yaml │ ├── hgformer_swin_tiny_bs16_20k_mapillary.yaml │ ├── maskformer2_swin_tiny_bs16_20k_mapillary.yaml │ ├── hgformer_swin_large_IN21k_384_bs16_20k_mapillary.yaml │ ├── maskformer2_R50_bs16_90k_gn_mapillary_20k.yaml │ ├── Base-mapillary19-SemanticSegmentation.yaml │ └── hgformer_R50_bs16_20k_mapillary.yaml └── city_c │ ├── hgformer_swin_large_IN21K_384_bs16_20k.yaml │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ ├── hgformer_swin_tiny_bs16_20k.yaml │ └── maskformer2_swin_tiny_bs16_20k.yaml ├── .gitignore ├── tools ├── convert-pretrained-swin-model-to-d2.py ├── convert-torchvision-to-d2.py ├── evaluate_coco_boundary_ap.py ├── README.md ├── visualize_data.py └── analyze_model.py ├── README.md ├── datasets ├── split_data │ ├── gta │ │ ├── resize_img.py │ │ └── split_gta.py │ └── synthia │ │ └── split_synthia.py ├── prepare_gta_sem_seg.py ├── generate_cityscapes_c.py ├── find_truncated_images.py ├── prepare_mapillary_sem_seg.py ├── prepare_synthia_sem_seg.py └── README.md ├── INSTALL.md ├── GETTING_STARTED.md ├── MODEL_ZOO.md └── demo ├── inference.py └── predictor.py /hgformer/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hgformer/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | -------------------------------------------------------------------------------- /hgformer/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /hgformer/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /hgformer/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /hgformer/modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /hgformer/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import datasets -------------------------------------------------------------------------------- /hgformer/data/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | from .balanced_sampler import ( 2 | BalancedTrainingSampler, 3 | ) 4 | 5 | 6 | __all__ = [ 7 | "BalancedTrainingSampler", 8 | ] 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | scipy 3 | numpy==1.23.1 4 | setuptools==58.0.4 5 | shapely 6 | timm 7 | h5py 8 | submitit 9 | scikit-image 10 | ftfy 11 | einops 12 | regex 13 | mmcv 14 | imagecorruptions -------------------------------------------------------------------------------- /hgformer/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import ( 3 | # register_acdc, 4 | register_gta, 5 | register_city_c, 6 | register_bdd, 7 | register_synthia, 8 | register_mapillary_19, 9 | register_city_c_vis, 10 | ) 11 | -------------------------------------------------------------------------------- /hgformer/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
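# Importing these decoder classes registers them (via Mask2Former's transformer-decoder registry) so that
# MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME in the configs can select one of them by class name.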
2 | from .maskformer_transformer_decoder import StandardTransformerDecoder 3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder 4 | from .groupFormer_transformer_decoder import GroupFormerDecoder 5 | from .mask2former_transformer_decoder_wo_maskatten import MultiScaleMaskedTransformerDecoderWoMaskAtten 6 | -------------------------------------------------------------------------------- /configs/cityscapes/maskformer2_swin_large_IN21k_384_bs16_20k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_20k_gn.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | SOLVER: 20 | MAX_ITER: 20000 -------------------------------------------------------------------------------- /configs/mapillary/maskformer2_swin_large_IN21k_384_bs16_20k_mapillary.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k_gn_mapillary_20k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | SOLVER: 20 | MAX_ITER: 20000 -------------------------------------------------------------------------------- /configs/mapillary/hgformer_swin_tiny_bs16_20k_mapillary.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: hgformer_R50_bs16_20k_mapillary.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | 17 | SOLVER: 18 | MAX_ITER: 20000 19 | IMS_PER_BATCH: 16 20 | 21 | TEST: 22 | CLUSTER_SOFTMAX: True 23 | PRED_STAGE: "spix_pixelexclude0125+stage3" -------------------------------------------------------------------------------- /configs/mapillary/maskformer2_swin_tiny_bs16_20k_mapillary.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k_gn_mapillary_20k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | 17 | DATALOADER: 18 | FILTER_EMPTY_ANNOTATIONS: True 19 | NUM_WORKERS: 4 20 | VERSION: 2 21 | SOLVER: 22 | MAX_ITER: 20000 23 | 24 | CUDNN_BENCHMARK: True 25 | -------------------------------------------------------------------------------- 
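The configs above all extend a `_BASE_` file and are consumed through detectron2's standard launcher flags. A minimal usage sketch, assuming the repository keeps Mask2Former's `train_net.py` entry point (the launcher script itself is not part of this listing):

```bash
# training (hypothetical invocation; the detectron2 launcher flags --num-gpus/--config-file are standard)
python train_net.py --num-gpus 8 \
  --config-file configs/cityscapes/hgformer_R50_bs16_20k.yaml

# evaluation of a trained checkpoint on the TEST datasets listed in the config
python train_net.py --eval-only \
  --config-file configs/cityscapes/hgformer_R50_bs16_20k.yaml \
  MODEL.WEIGHTS /path/to/checkpoint.pth
```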
/configs/city_c/hgformer_swin_large_IN21K_384_bs16_20k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: hgformer_swin_tiny_bs16_20k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | 18 | SOLVER: 19 | MAX_ITER: 20000 20 | # IMS_PER_BATCH: 2 21 | 22 | TEST: 23 | CLUSTER_SOFTMAX: True 24 | PRED_STAGE: "spix_all_stage_exclude012" -------------------------------------------------------------------------------- /configs/cityscapes/hgformer_swin_large_IN21K_384_bs16_20k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: hgformer_R50_bs16_20k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | 18 | SOLVER: 19 | MAX_ITER: 20000 20 | # IMS_PER_BATCH: 2 21 | 22 | TEST: 23 | CLUSTER_SOFTMAX: True 24 | PRED_STAGE: "spix_pixelexclude0125+stage3" -------------------------------------------------------------------------------- /configs/mapillary/hgformer_swin_large_IN21k_384_bs16_20k_mapillary.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: hgformer_R50_bs16_20k_mapillary.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | 18 | SOLVER: 19 | MAX_ITER: 20000 20 | # IMS_PER_BATCH: 2 21 | 22 | TEST: 23 | CLUSTER_SOFTMAX: True 24 | PRED_STAGE: "spix_pixelexclude0125+stage3" -------------------------------------------------------------------------------- /hgformer/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
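# Importing these modules registers the backbones, pixel decoders, and segmentation heads with
# detectron2's registries, so the YAML configs can refer to them by name
# (e.g. "D2SwinTransformer", "MSDeformAttnPixelDecoder", "MaskFormerHead").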
2 | from .backbone.swin import D2SwinTransformer, D2SwinTransformerFreeze 3 | from .pixel_decoder.fpn import BasePixelDecoder 4 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder 5 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoderv2 6 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoderv3 7 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecodervSingleLayer 8 | from .meta_arch.mask_former_head import MaskFormerHead 9 | from .meta_arch.group_former_head import GroupFormerHead 10 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead 11 | -------------------------------------------------------------------------------- /configs/cityscapes/maskformer2_swin_tiny_bs16_20k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_20k_gn.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | #DATASETS: 17 | # TRAIN: ("cityscapes_fine_sem_seg_train",) 18 | # TEST: ("cityscapes_fine_sem_seg_val",) 19 | DATALOADER: 20 | FILTER_EMPTY_ANNOTATIONS: True 21 | NUM_WORKERS: 4 22 | VERSION: 2 23 | SOLVER: 24 | MAX_ITER: 20000 25 | 26 | CUDNN_BENCHMARK: True 27 | -------------------------------------------------------------------------------- /configs/cityscapes/hgformer_swin_tiny_bs16_20k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: hgformer_R50_bs16_20k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | #DATASETS: 17 | # TRAIN: ("cityscapes_fine_sem_seg_train",) 18 | # TEST: ("synthia_val", "cityscapes_fine_sem_seg_val", "bdd_val", "mapillary_val", "gta_trainid_val") 19 | SOLVER: 20 | MAX_ITER: 20000 21 | IMS_PER_BATCH: 16 22 | 23 | TEST: 24 | CLUSTER_SOFTMAX: True 25 | PRED_STAGE: "spix_pixelexclude0125+stage3" -------------------------------------------------------------------------------- /hgformer/modeling/pixel_decoder/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | # Copyright (c) Facebook, Inc. and its affiliates. 
11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 12 | 13 | python setup.py build install 14 | -------------------------------------------------------------------------------- /hgformer/modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /hgformer/modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /hgformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import data # register all new datasets 3 | from . 
import modeling 4 | 5 | # config 6 | from .config import add_maskformer2_config 7 | 8 | # dataset loading 9 | 10 | from .data.dataset_mappers.mask_former_instance_dataset_mapper import ( 11 | MaskFormerInstanceDatasetMapper, 12 | ) 13 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import ( 14 | MaskFormerPanopticDatasetMapper, 15 | ) 16 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 17 | MaskFormerSemanticDatasetMapper, 18 | ) 19 | 20 | 21 | # models 22 | from .maskformer_model import MaskFormer 23 | from .test_time_augmentation import SemanticSegmentorWithTTA 24 | from .groupformer_model import GroupFormer 25 | 26 | # evaluation 27 | from .evaluation.instance_evaluation import InstanceSegEvaluator 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # output dir 2 | output 3 | instant_test_output 4 | inference_test_output 5 | 6 | 7 | *.png 8 | *.json 9 | *.diff 10 | *.jpg 11 | !/projects/DensePose/doc/images/*.jpg 12 | 13 | # compilation and distribution 14 | __pycache__ 15 | _ext 16 | *.pyc 17 | *.pyd 18 | *.so 19 | *.dll 20 | *.egg-info/ 21 | build/ 22 | dist/ 23 | wheels/ 24 | 25 | # pytorch/python/numpy formats 26 | *.pth 27 | *.pkl 28 | *.npy 29 | *.ts 30 | model_ts*.txt 31 | 32 | # ipython/jupyter notebooks 33 | *.ipynb 34 | **/.ipynb_checkpoints/ 35 | 36 | # Editor temporaries 37 | *.swn 38 | *.swo 39 | *.swp 40 | *~ 41 | 42 | # editor settings 43 | .idea 44 | .vscode 45 | _darcs 46 | 47 | # project dirs 48 | /detectron2/model_zoo/configs 49 | # /datasets/* 50 | !/datasets/*.* 51 | /projects/*/datasets 52 | /models 53 | /snippet 54 | /GroupViT 55 | /work_dirs 56 | /work_dirs_1 57 | test*.sh 58 | start*.sh 59 | slurm* 60 | /detectron2 61 | -------------------------------------------------------------------------------- /tools/convert-pretrained-swin-model-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download pretrained swin model: 12 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 13 | # run the conversion 14 | ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 15 | # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl" 18 | INPUT: 19 | FORMAT: "RGB" 20 | """ 21 | 22 | if __name__ == "__main__": 23 | input = sys.argv[1] 24 | 25 | obj = torch.load(input, map_location="cpu")["model"] 26 | 27 | res = {"model": obj, "__author__": "third_party", "matching_heuristics": True} 28 | 29 | with open(sys.argv[2], "wb") as f: 30 | pkl.dump(res, f) 31 | -------------------------------------------------------------------------------- /hgformer/modeling/pixel_decoder/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include "ms_deform_attn.h" 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /hgformer/data/datasets/register_gta.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import os 3 | 4 | from detectron2.data import DatasetCatalog, MetadataCatalog 5 | from detectron2.data.datasets import load_sem_seg 6 | from detectron2.data.datasets.builtin_meta import _get_builtin_metadata 7 | 8 | # ==== Predefined splits for raw gta images =========== 9 | 10 | GTA_Trainid = { 11 | "gta_trainid_val": ("gta/images/valid/", "gta/labels_detectron2/valid/"), 12 | } 13 | 14 | def register_all_gta_sem_seg(root): 15 | for key, (image_dir, gt_dir) in GTA_Trainid.items(): 16 | meta = _get_builtin_metadata("cityscapes") 17 | image_dir = os.path.join(root, image_dir) 18 | gt_dir = os.path.join(root, gt_dir) 19 | 20 | DatasetCatalog.register( 21 | key, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="png") 22 | ) 23 | MetadataCatalog.get(key).set( 24 | image_dir=image_dir, 25 | gt_dir=gt_dir, 26 | evaluator_type="sem_seg", 27 | ignore_label=255, 28 | **meta, 29 | ) 30 | 31 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 32 | 33 | register_all_gta_sem_seg(_root) -------------------------------------------------------------------------------- /hgformer/data/datasets/register_synthia.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
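# Registers the SYNTHIA train/val splits with detectron2's DatasetCatalog; labels are expected as
# Cityscapes train-ID PNGs under synthia/labels_detectron2/ (see datasets/prepare_synthia_sem_seg.py).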
2 | import os 3 | 4 | from detectron2.data import DatasetCatalog, MetadataCatalog 5 | from detectron2.data.datasets import load_sem_seg 6 | from detectron2.data.datasets.builtin_meta import _get_builtin_metadata 7 | # from .acdc import load_acdc_semantic 8 | # from detectron2.data.datasets import load_sem_seg 9 | 10 | 11 | _RAW_BDD_SPLITS = { 12 | "synthia_train": ("synthia/RGB/train", "synthia/labels_detectron2/train"), 13 | "synthia_val": ("synthia/RGB/val", "synthia/labels_detectron2/val") 14 | } 15 | 16 | def register_all_synthia(root): 17 | for key, (image_dir, gt_dir) in _RAW_BDD_SPLITS.items(): 18 | meta = _get_builtin_metadata("cityscapes") 19 | image_dir = os.path.join(root, image_dir) 20 | gt_dir = os.path.join(root, gt_dir) 21 | 22 | DatasetCatalog.register( 23 | key, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="png") 24 | ) 25 | MetadataCatalog.get(key).set( 26 | image_dir=image_dir, 27 | gt_dir=gt_dir, 28 | evaluator_type="sem_seg", 29 | ignore_label=255, 30 | **meta, 31 | ) 32 | 33 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 34 | register_all_synthia(_root) -------------------------------------------------------------------------------- /hgformer/data/datasets/register_bdd.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import os 3 | 4 | from detectron2.data import DatasetCatalog, MetadataCatalog 5 | from detectron2.data.datasets import load_sem_seg 6 | from detectron2.data.datasets.builtin_meta import _get_builtin_metadata 7 | 8 | 9 | _RAW_BDD_SPLITS = { 10 | "bdd_train": ("bdd/images/10k/train", "bdd/labels/sem_seg/masks/train"), 11 | "bdd_val": ("bdd/images/10k/val", "bdd/labels/sem_seg/masks/val") 12 | } 13 | 14 | def register_all_bdd(root): 15 | for key, (image_dir, gt_dir) in _RAW_BDD_SPLITS.items(): 16 | meta = _get_builtin_metadata("cityscapes") 17 | image_dir = os.path.join(root, image_dir) 18 | gt_dir = os.path.join(root, gt_dir) 19 | 20 | # DatasetCatalog.register( 21 | # key, lambda x=image_dir, y=gt_dir: load_sem_seg(x, y) 22 | # ) 23 | DatasetCatalog.register( 24 | key, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="jpg") 25 | ) 26 | MetadataCatalog.get(key).set( 27 | image_dir=image_dir, 28 | gt_dir=gt_dir, 29 | evaluator_type="sem_seg", 30 | ignore_label=255, 31 | **meta, 32 | ) 33 | 34 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 35 | register_all_bdd(_root) -------------------------------------------------------------------------------- /hgformer/data/datasets/register_mapillary_19.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
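# Registers the Mapillary Vistas train/val splits, remapped to the 19 Cityscapes classes
# (see datasets/prepare_mapillary_sem_seg.py), with detectron2's DatasetCatalog.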
2 | import os 3 | 4 | from detectron2.data import DatasetCatalog, MetadataCatalog 5 | from detectron2.data.datasets import load_sem_seg 6 | from detectron2.data.datasets.builtin_meta import _get_builtin_metadata 7 | # from detectron2.data.datasets import load_sem_seg 8 | 9 | 10 | _RAW_BDD_SPLITS = { 11 | "mapillary_train": ("mapillary/training/images", "mapillary/labels_detectron2/training"), 12 | "mapillary_val": ("mapillary/validation/images", "mapillary/labels_detectron2/validation") 13 | } 14 | 15 | def register_all_mapillary_19(root): 16 | for key, (image_dir, gt_dir) in _RAW_BDD_SPLITS.items(): 17 | meta = _get_builtin_metadata("cityscapes") 18 | image_dir = os.path.join(root, image_dir) 19 | gt_dir = os.path.join(root, gt_dir) 20 | 21 | DatasetCatalog.register( 22 | key, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="jpg") 23 | ) 24 | MetadataCatalog.get(key).set( 25 | image_dir=image_dir, 26 | gt_dir=gt_dir, 27 | evaluator_type="sem_seg", 28 | ignore_label=255, 29 | **meta, 30 | ) 31 | 32 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 33 | register_all_mapillary_19(_root) -------------------------------------------------------------------------------- /configs/city_c/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../cityscapes/maskformer2_R50_bs16_20k_gn.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | DATASETS: 20 | TRAIN: ("cityscapes_fine_sem_seg_train", ) 21 | TEST: ("cityscapes_fine_sem_seg_val", "cityscapes_fine_gaussian_noise_5_val", "cityscapes_fine_shot_noise_5_val", "cityscapes_fine_impulse_noise_5_val", "cityscapes_fine_defocus_blur_5_val", "cityscapes_fine_glass_blur_5_val", "cityscapes_fine_motion_blur_5_val", "cityscapes_fine_zoom_blur_5_val", "cityscapes_fine_snow_5_val", "cityscapes_fine_frost_5_val", "cityscapes_fine_fog_5_val", "cityscapes_fine_brightness_5_val", "cityscapes_fine_contrast_5_val", "cityscapes_fine_elastic_transform_5_val", "cityscapes_fine_pixelate_5_val", "cityscapes_fine_jpeg_compression_5_val", "cityscapes_fine_speckle_noise_5_val", "cityscapes_fine_gaussian_blur_5_val", "cityscapes_fine_spatter_5_val", "cityscapes_fine_saturate_5_val") 22 | -------------------------------------------------------------------------------- /configs/city_c/hgformer_swin_tiny_bs16_20k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../cityscapes/hgformer_R50_bs16_20k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | 17 | SOLVER: 18 | MAX_ITER: 20000 19 | # IMS_PER_BATCH: 2 20 | DATASETS: 21 | TRAIN: ("cityscapes_fine_sem_seg_train", ) 22 | TEST: ("cityscapes_fine_sem_seg_val", "cityscapes_fine_gaussian_noise_5_val", "cityscapes_fine_shot_noise_5_val", 
"cityscapes_fine_impulse_noise_5_val", "cityscapes_fine_defocus_blur_5_val", "cityscapes_fine_glass_blur_5_val", "cityscapes_fine_motion_blur_5_val", "cityscapes_fine_zoom_blur_5_val", "cityscapes_fine_snow_5_val", "cityscapes_fine_frost_5_val", "cityscapes_fine_fog_5_val", "cityscapes_fine_brightness_5_val", "cityscapes_fine_contrast_5_val", "cityscapes_fine_elastic_transform_5_val", "cityscapes_fine_pixelate_5_val", "cityscapes_fine_jpeg_compression_5_val", "cityscapes_fine_speckle_noise_5_val", "cityscapes_fine_gaussian_blur_5_val", "cityscapes_fine_spatter_5_val", "cityscapes_fine_saturate_5_val") 23 | TEST: 24 | CLUSTER_SOFTMAX: True 25 | PRED_STAGE: "spix_pixelexclude0125+stage3" -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HGFormer: Hierarchical Grouping Transformer for Domain Generalized Semantic Segmentation 2 | 3 | This is the official code for the [HGFormer](https://openaccess.thecvf.com/content/CVPR2023/papers/Ding_HGFormer_Hierarchical_Grouping_Transformer_for_Domain_Generalized_Semantic_Segmentation_CVPR_2023_paper.pdf) (CVPR 2023) 4 | 5 | ## Installation 6 | 7 | See [installation instructions](INSTALL.md). 8 | 9 | ## Getting Started 10 | 11 | See [Preparing Datasets for HGFormer](datasets/README.md). 12 | 13 | See [Getting Started with HGFormer](GETTING_STARTED.md). 14 | 15 | ## Pre-trained Models and Baselines 16 | 17 | We provide a large set of baseline results and trained models available for download in the [HGFormer Model Zoo](MODEL_ZOO.md). 18 | 19 | ## Citing HGFormer 20 | 21 | If you use HGFormer in your research, please use the following BibTeX entry. 22 | 23 | ```BibTeX 24 | @inproceedings{ding2023hgformer, 25 | title={HGFormer: Hierarchical Grouping Transformer for Domain Generalized Semantic Segmentation}, 26 | author={Ding, Jian and Xue, Nan and Xia, Gui-Song and Schiele, Bernt and Dai, Dengxin}, 27 | booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, 28 | pages={15413--15423}, 29 | year={2023} 30 | } 31 | ``` 32 | 33 | ## Acknowledgement 34 | 35 | Code is largely based on Mask2Former (https://github.com/facebookresearch/Mask2Former). 
36 | -------------------------------------------------------------------------------- /configs/city_c/maskformer2_swin_tiny_bs16_20k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../cityscapes/maskformer2_R50_bs16_20k_gn.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | 17 | DATALOADER: 18 | FILTER_EMPTY_ANNOTATIONS: True 19 | NUM_WORKERS: 4 20 | VERSION: 2 21 | SOLVER: 22 | MAX_ITER: 20000 23 | 24 | CUDNN_BENCHMARK: True 25 | DATASETS: 26 | TRAIN: ("cityscapes_fine_sem_seg_train", ) 27 | TEST: ("cityscapes_fine_sem_seg_val", "cityscapes_fine_gaussian_noise_5_val", "cityscapes_fine_shot_noise_5_val", "cityscapes_fine_impulse_noise_5_val", "cityscapes_fine_defocus_blur_5_val", "cityscapes_fine_glass_blur_5_val", "cityscapes_fine_motion_blur_5_val", "cityscapes_fine_zoom_blur_5_val", "cityscapes_fine_snow_5_val", "cityscapes_fine_frost_5_val", "cityscapes_fine_fog_5_val", "cityscapes_fine_brightness_5_val", "cityscapes_fine_contrast_5_val", "cityscapes_fine_elastic_transform_5_val", "cityscapes_fine_pixelate_5_val", "cityscapes_fine_jpeg_compression_5_val", "cityscapes_fine_speckle_noise_5_val", "cityscapes_fine_gaussian_blur_5_val", "cityscapes_fine_spatter_5_val", "cityscapes_fine_saturate_5_val") 28 | -------------------------------------------------------------------------------- /hgformer/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor ms_deform_attn_cuda_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step); 26 | 27 | std::vector ms_deform_attn_cuda_backward( 28 | const at::Tensor &value, 29 | const at::Tensor &spatial_shapes, 30 | const at::Tensor &level_start_index, 31 | const at::Tensor &sampling_loc, 32 | const at::Tensor &attn_weight, 33 | const at::Tensor &grad_output, 34 | const int im2col_step); 35 | 36 | -------------------------------------------------------------------------------- /hgformer/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 
2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor 20 | ms_deform_attn_cpu_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step); 27 | 28 | std::vector 29 | ms_deform_attn_cpu_backward( 30 | const at::Tensor &value, 31 | const at::Tensor &spatial_shapes, 32 | const at::Tensor &level_start_index, 33 | const at::Tensor &sampling_loc, 34 | const at::Tensor &attn_weight, 35 | const at::Tensor &grad_output, 36 | const int im2col_step); 37 | 38 | 39 | -------------------------------------------------------------------------------- /configs/cityscapes/maskformer2_R50_bs16_20k_gn.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | RESNETS: 5 | NORM: "GN" 6 | SEM_SEG_HEAD: 7 | NAME: "MaskFormerHead" 8 | IGNORE_VALUE: 255 9 | NUM_CLASSES: 19 10 | LOSS_WEIGHT: 1.0 11 | CONVS_DIM: 256 12 | MASK_DIM: 256 13 | NORM: "GN" 14 | # pixel decoder 15 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 16 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 17 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 18 | COMMON_STRIDE: 4 19 | TRANSFORMER_ENC_LAYERS: 6 20 | MASK_FORMER: 21 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 22 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 23 | DEEP_SUPERVISION: True 24 | NO_OBJECT_WEIGHT: 0.1 25 | CLASS_WEIGHT: 2.0 26 | MASK_WEIGHT: 5.0 27 | DICE_WEIGHT: 5.0 28 | HIDDEN_DIM: 256 29 | NUM_OBJECT_QUERIES: 100 30 | NHEADS: 8 31 | DROPOUT: 0.0 32 | DIM_FEEDFORWARD: 2048 33 | ENC_LAYERS: 0 34 | PRE_NORM: False 35 | ENFORCE_INPUT_PROJ: False 36 | SIZE_DIVISIBILITY: 32 37 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 38 | TRAIN_NUM_POINTS: 12544 39 | OVERSAMPLE_RATIO: 3.0 40 | IMPORTANCE_SAMPLE_RATIO: 0.75 41 | TEST: 42 | SEMANTIC_ON: True 43 | INSTANCE_ON: False 44 | PANOPTIC_ON: False 45 | OVERLAP_THRESHOLD: 0.8 46 | OBJECT_MASK_THRESHOLD: 0.8 47 | SOLVER: 48 | IMS_PER_BATCH: 16 49 | BASE_LR: 0.0001 50 | MAX_ITER: 20000 -------------------------------------------------------------------------------- /configs/mapillary/maskformer2_R50_bs16_90k_gn_mapillary_20k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-mapillary19-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | RESNETS: 5 | NORM: "GN" 6 | SEM_SEG_HEAD: 7 | NAME: "MaskFormerHead" 8 | IGNORE_VALUE: 255 9 | NUM_CLASSES: 19 10 | LOSS_WEIGHT: 1.0 11 | CONVS_DIM: 256 12 | MASK_DIM: 256 13 | NORM: "GN" 14 | # pixel decoder 15 | 
PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 16 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 17 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 18 | COMMON_STRIDE: 4 19 | TRANSFORMER_ENC_LAYERS: 6 20 | MASK_FORMER: 21 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 22 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 23 | DEEP_SUPERVISION: True 24 | NO_OBJECT_WEIGHT: 0.1 25 | CLASS_WEIGHT: 2.0 26 | MASK_WEIGHT: 5.0 27 | DICE_WEIGHT: 5.0 28 | HIDDEN_DIM: 256 29 | NUM_OBJECT_QUERIES: 100 30 | NHEADS: 8 31 | DROPOUT: 0.0 32 | DIM_FEEDFORWARD: 2048 33 | ENC_LAYERS: 0 34 | PRE_NORM: False 35 | ENFORCE_INPUT_PROJ: False 36 | SIZE_DIVISIBILITY: 32 37 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 38 | TRAIN_NUM_POINTS: 12544 39 | OVERSAMPLE_RATIO: 3.0 40 | IMPORTANCE_SAMPLE_RATIO: 0.75 41 | TEST: 42 | SEMANTIC_ON: True 43 | INSTANCE_ON: False 44 | PANOPTIC_ON: False 45 | OVERLAP_THRESHOLD: 0.8 46 | OBJECT_MASK_THRESHOLD: 0.8 47 | SOLVER: 48 | IMS_PER_BATCH: 16 49 | BASE_LR: 0.0001 50 | MAX_ITER: 20000 -------------------------------------------------------------------------------- /hgformer/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | 22 | at::Tensor 23 | ms_deform_attn_cpu_forward( 24 | const at::Tensor &value, 25 | const at::Tensor &spatial_shapes, 26 | const at::Tensor &level_start_index, 27 | const at::Tensor &sampling_loc, 28 | const at::Tensor &attn_weight, 29 | const int im2col_step) 30 | { 31 | AT_ERROR("Not implement on cpu"); 32 | } 33 | 34 | std::vector 35 | ms_deform_attn_cpu_backward( 36 | const at::Tensor &value, 37 | const at::Tensor &spatial_shapes, 38 | const at::Tensor &level_start_index, 39 | const at::Tensor &sampling_loc, 40 | const at::Tensor &attn_weight, 41 | const at::Tensor &grad_output, 42 | const int im2col_step) 43 | { 44 | AT_ERROR("Not implement on cpu"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /tools/convert-torchvision-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download one of the ResNet{18,34,50,101,152} models from torchvision: 12 | wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth 13 | # run the conversion 14 | ./convert-torchvision-to-d2.py r50.pth r50.pkl 15 | # Then, use r50.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/r50.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | RESNETS: 21 | DEPTH: 50 22 | STRIDE_IN_1X1: False 23 | INPUT: 24 | FORMAT: "RGB" 25 | """ 26 | 27 | if __name__ == "__main__": 28 | input = sys.argv[1] 29 | 30 | obj = torch.load(input, map_location="cpu") 31 | 32 | newmodel = {} 33 | for k in list(obj.keys()): 34 | old_k = k 35 | if "layer" not in k: 36 | k = "stem." + k 37 | for t in [1, 2, 3, 4]: 38 | k = k.replace("layer{}".format(t), "res{}".format(t + 1)) 39 | for t in [1, 2, 3]: 40 | k = k.replace("bn{}".format(t), "conv{}.norm".format(t)) 41 | k = k.replace("downsample.0", "shortcut") 42 | k = k.replace("downsample.1", "shortcut.norm") 43 | print(old_k, "->", k) 44 | newmodel[k] = obj.pop(old_k).detach().numpy() 45 | 46 | res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True} 47 | 48 | with open(sys.argv[2], "wb") as f: 49 | pkl.dump(res, f) 50 | if obj: 51 | print("Unconverted keys:", obj.keys()) 52 | -------------------------------------------------------------------------------- /datasets/split_data/gta/resize_img.py: -------------------------------------------------------------------------------- 1 | import os 2 | from PIL import Image 3 | import numpy as np 4 | import cv2 5 | 6 | def GetFileFromThisRootDir(dir,ext = None): 7 | allfiles = [] 8 | needExtFilter = (ext != None) 9 | for root,dirs,files in os.walk(dir): 10 | for filespath in files: 11 | filepath = os.path.join(root, filespath) 12 | extension = os.path.splitext(filepath)[1][1:] 13 | if needExtFilter and extension in ext: 14 | allfiles.append(filepath) 15 | elif not needExtFilter: 16 | allfiles.append(filepath) 17 | return allfiles 18 | 19 | def resize_split(split): 20 | filenames = GetFileFromThisRootDir(f'datasets/GTA/images/{split}') 21 | for filename in filenames: 22 | basename = os.path.basename(filename) 23 | img = Image.open(filename) 24 | gtname = os.path.join(f'datasets/GTA/labels/{split}', basename) 25 | gt = Image.open(gtname) 26 | print(f'filename: {filename}') 27 | if not os.path.exists(f'datasets/GTA/labels/{split}_resize'): 28 | os.makedirs(f'datasets/GTA/labels/{split}_resize') 29 | if (img.width != gt.width) or (img.height != gt.height): 30 | # read img 31 | gt_np = np.asarray(gt) 32 | # resize img 33 | width, height = img.width, img.height 34 | resized_gt_np = cv2.resize(gt_np, (width, height), interpolation=cv2.INTER_NEAREST) 35 | # import ipdb; 36 | # ipdb.set_trace() 37 | # save img 38 | outname = os.path.join(f'datasets/GTA/labels/{split}_resize', basename) 39 | cv2.imwrite(outname, resized_gt_np) 40 | 41 | if __name__ == '__main__': 42 | resize_split('valid') 43 | # resize_split('train') 44 | # resize_split('test') -------------------------------------------------------------------------------- /tools/evaluate_coco_boundary_ap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 3 | # Modified by Bowen Cheng from: https://github.com/bowenc0221/boundary-iou-api/blob/master/tools/coco_instance_evaluation.py 4 | 5 | """ 6 | Evaluation for COCO val2017: 7 | python ./tools/coco_instance_evaluation.py \ 8 | --gt-json-file COCO_GT_JSON \ 9 | --dt-json-file COCO_DT_JSON 10 | """ 11 | import argparse 12 | import json 13 | 14 | from boundary_iou.coco_instance_api.coco import COCO 15 | from boundary_iou.coco_instance_api.cocoeval import COCOeval 16 | 17 | 18 | def main(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--gt-json-file", default="") 21 | parser.add_argument("--dt-json-file", default="") 22 | parser.add_argument("--iou-type", default="boundary") 23 | parser.add_argument("--dilation-ratio", default="0.020", type=float) 24 | args = parser.parse_args() 25 | print(args) 26 | 27 | annFile = args.gt_json_file 28 | resFile = args.dt_json_file 29 | dilation_ratio = args.dilation_ratio 30 | if args.iou_type == "boundary": 31 | get_boundary = True 32 | else: 33 | get_boundary = False 34 | cocoGt = COCO(annFile, get_boundary=get_boundary, dilation_ratio=dilation_ratio) 35 | 36 | # remove box predictions 37 | resFile = json.load(open(resFile)) 38 | for c in resFile: 39 | c.pop("bbox", None) 40 | 41 | cocoDt = cocoGt.loadRes(resFile) 42 | cocoEval = COCOeval(cocoGt, cocoDt, iouType=args.iou_type, dilation_ratio=dilation_ratio) 43 | cocoEval.evaluate() 44 | cocoEval.accumulate() 45 | cocoEval.summarize() 46 | 47 | 48 | if __name__ == '__main__': 49 | main() 50 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | ### Requirements 4 | - Linux or macOS with Python ≥ 3.6 5 | - PyTorch ≥ 1.9 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation. 6 | Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note, please check 7 | PyTorch version matches that is required by Detectron2. 8 | - Detectron2: follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html). 9 | - OpenCV is optional but needed by demo and visualization 10 | - `pip install -r requirements.txt` 11 | 12 | ### CUDA kernel for MSDeformAttn 13 | After preparing the required environment, run the following command to compile CUDA kernel for MSDeformAttn: 14 | 15 | `CUDA_HOME` must be defined and points to the directory of the installed CUDA toolkit. 16 | 17 | ```bash 18 | cd hgformer/modeling/pixel_decoder/ops 19 | python setup.py build install 20 | ``` 21 | 22 | #### Building on another system 23 | To build on a system that does not have a GPU device but provide the drivers: 24 | ```bash 25 | TORCH_CUDA_ARCH_LIST='8.0' FORCE_CUDA=1 python setup.py build install 26 | ``` 27 | 28 | ### Example conda environment setup 29 | ```bash 30 | conda create --name hgformer python=3.8 -y 31 | conda activate hgformer 32 | conda install pytorch==1.9.0 torchvision==0.10.0 cudatoolkit=11.1 -c pytorch -c nvidia 33 | pip install -U opencv-python 34 | 35 | # under your working directory 36 | python -m pip install detectron2 -f \ 37 | https://dl.fbaipublicfiles.com/detectron2/wheels/cu111/torch1.9/index.html 38 | 39 | pip install git+https://github.com/mcordts/cityscapesScripts.git 40 | 41 | cd .. 
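# clone HGFormer, install its Python dependencies, and build the MSDeformAttn CUDA op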
42 | git clone https://github.com/dingjiansw101/HGFormer.git 43 | cd HGFormer 44 | pip install -r requirements.txt 45 | cd hgformer/modeling/pixel_decoder/ops 46 | sh make.sh 47 | ``` 48 | -------------------------------------------------------------------------------- /configs/mapillary/Base-mapillary19-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_train",) 18 | TEST: ("mapillary_val", "gta_trainid_val", "synthia_val", "cityscapes_fine_sem_seg_val", "bdd_val") 19 | 20 | SOLVER: 21 | IMS_PER_BATCH: 16 22 | BASE_LR: 0.0001 23 | MAX_ITER: 90000 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 0 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | LR_SCHEDULER_NAME: "WarmupPolyLR" 29 | BACKBONE_MULTIPLIER: 0.1 30 | CLIP_GRADIENTS: 31 | ENABLED: True 32 | CLIP_TYPE: "full_model" 33 | CLIP_VALUE: 0.01 34 | NORM_TYPE: 2.0 35 | AMP: 36 | ENABLED: True 37 | INPUT: 38 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 39 | MIN_SIZE_TRAIN_SAMPLING: "choice" 40 | MIN_SIZE_TEST: 1024 41 | MAX_SIZE_TRAIN: 4096 42 | MAX_SIZE_TEST: 2048 43 | CROP: 44 | ENABLED: True 45 | TYPE: "absolute" 46 | SIZE: (512, 1024) 47 | SINGLE_CATEGORY_MAX_AREA: 1.0 48 | COLOR_AUG_SSD: True 49 | SIZE_DIVISIBILITY: -1 50 | FORMAT: "RGB" 51 | DATASET_MAPPER_NAME: "mask_former_semantic" 52 | TEST: 53 | EVAL_PERIOD: 90000 54 | AUG: 55 | ENABLED: False 56 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 57 | MAX_SIZE: 4096 58 | FLIP: True 59 | DATALOADER: 60 | FILTER_EMPTY_ANNOTATIONS: True 61 | NUM_WORKERS: 4 62 | VERSION: 2 63 | -------------------------------------------------------------------------------- /datasets/prepare_gta_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import os 4 | from pathlib import Path 5 | 6 | import numpy as np 7 | import tqdm 8 | from PIL import Image 9 | from multiprocessing import Pool 10 | 11 | id_to_trainid = {7: 0, 8: 1, 11: 2, 12: 3, 13: 4, 17: 5, 12 | 19: 6, 20: 7, 21: 8, 22: 9, 23: 10, 24: 11, 25: 12, 13 | 26: 13, 27: 14, 28: 15, 31: 16, 32: 17, 33: 18} 14 | 15 | 16 | def convert(input, outputpath): 17 | lab = np.asarray(Image.open(input)) 18 | assert lab.dtype == np.uint8 19 | output = np.zeros_like(lab, dtype=np.uint8) + 255 20 | for obj_id in np.unique(lab): 21 | if obj_id in id_to_trainid: 22 | output[lab == obj_id] = id_to_trainid[obj_id] 23 | 24 | Image.fromarray(output).save(outputpath) 25 | 26 | def worker(file_tuple): 27 | file, output_file = file_tuple 28 | lab = np.asarray(Image.open(file)) 29 | assert lab.dtype == np.uint8 30 | output = np.zeros_like(lab, dtype=np.uint8) + 255 31 | for obj_id in np.unique(lab): 32 | if obj_id in id_to_trainid: 33 | output[lab == obj_id] = id_to_trainid[obj_id] 34 | 35 | Image.fromarray(output).save(output_file) 36 | 37 | if __name__ == "__main__": 38 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "GTA" 
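    # remap each split's raw GTA label IDs to Cityscapes train IDs (unmapped IDs become 255)
    # and write the results to labels_detectron2/<split> using a 32-worker pool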
39 | for name in ["train", "valid", "test"]: 40 | annotation_dir = dataset_dir / "labels" / name 41 | output_dir = dataset_dir / "labels_detectron2" / name 42 | output_dir.mkdir(parents=True, exist_ok=True) 43 | 44 | file_list = [] 45 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 46 | output_file = output_dir / file.name 47 | file_list.append((file, output_file)) 48 | # convert(file, output_file) 49 | 50 | pool = Pool(32) 51 | pool.map(worker, file_list) 52 | print(f'done {name}') 53 | -------------------------------------------------------------------------------- /configs/cityscapes/Base-Cityscapes-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | 17 | DATASETS: 18 | TRAIN: ("cityscapes_fine_sem_seg_train",) 19 | TEST: ("cityscapes_fine_sem_seg_val", "mapillary_val", "bdd_val", "gta_trainid_val", "synthia_val") 20 | SOLVER: 21 | IMS_PER_BATCH: 16 22 | BASE_LR: 0.0001 23 | MAX_ITER: 90000 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 0 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | LR_SCHEDULER_NAME: "WarmupPolyLR" 29 | BACKBONE_MULTIPLIER: 0.1 30 | # dense period for job array on gpu22 31 | CHECKPOINT_PERIOD: 1000 32 | CLIP_GRADIENTS: 33 | ENABLED: True 34 | CLIP_TYPE: "full_model" 35 | CLIP_VALUE: 0.01 36 | NORM_TYPE: 2.0 37 | AMP: 38 | ENABLED: True 39 | INPUT: 40 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 41 | MIN_SIZE_TRAIN_SAMPLING: "choice" 42 | MIN_SIZE_TEST: 1024 43 | MAX_SIZE_TRAIN: 4096 44 | MAX_SIZE_TEST: 2048 45 | CROP: 46 | ENABLED: True 47 | TYPE: "absolute" 48 | SIZE: (512, 1024) 49 | SINGLE_CATEGORY_MAX_AREA: 1.0 50 | COLOR_AUG_SSD: True 51 | SIZE_DIVISIBILITY: -1 52 | FORMAT: "RGB" 53 | DATASET_MAPPER_NAME: "mask_former_semantic" 54 | TEST: 55 | EVAL_PERIOD: 5000 56 | AUG: 57 | ENABLED: False 58 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 59 | MAX_SIZE: 4096 60 | FLIP: True 61 | DATALOADER: 62 | FILTER_EMPTY_ANNOTATIONS: True 63 | NUM_WORKERS: 4 64 | VERSION: 2 65 | 66 | #CUDNN_BENCHMARK: True -------------------------------------------------------------------------------- /hgformer/data/datasets/register_city_c.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
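# Registers one Cityscapes-C validation split per (corruption, severity) pair, e.g. "cityscapes_fine_fog_5_val",
# reusing the clean Cityscapes ground truth and evaluator; the corrupted images themselves are produced by
# datasets/generate_cityscapes_c.py.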
2 | import os 3 | 4 | from detectron2.data import DatasetCatalog, MetadataCatalog 5 | from detectron2.data.datasets import load_sem_seg 6 | from detectron2.data.datasets.builtin_meta import _get_builtin_metadata 7 | # from .acdc import load_acdc_semantic 8 | from detectron2.data.datasets.cityscapes import load_cityscapes_semantic 9 | 10 | corruptions = ['clean', 'gaussian_noise', 'shot_noise', 'impulse_noise', 'defocus_blur', 11 | 'glass_blur', 'motion_blur', 'zoom_blur', 'snow', 'frost', 'fog', 12 | 'brightness', 'contrast', 'elastic_transform', 'pixelate', 'jpeg_compression', 13 | 'speckle_noise', 'gaussian_blur', 'spatter', 'saturate'] 14 | # ==== Predefined splits for raw cityscapes c images =========== 15 | 16 | _RAW_ACDC_SPLITS = {} 17 | for noise in corruptions: 18 | if noise == 'clean': 19 | cur_data = {f"cityscapes_fine_{noise}_val": (f"cityscapes-c/{noise}/", "cityscapes/gtFine/val/")} 20 | else: 21 | for severity in range(5): 22 | severity_str = str(severity+1) 23 | cur_data = {f"cityscapes_fine_{noise}_{severity_str}_val": (f"cityscapes-c/{noise}/{severity_str}", "cityscapes/gtFine/val/")} 24 | _RAW_ACDC_SPLITS.update(cur_data) 25 | def register_all_city_c(root): 26 | for key, (image_dir, gt_dir) in _RAW_ACDC_SPLITS.items(): 27 | meta = _get_builtin_metadata("cityscapes") 28 | image_dir = os.path.join(root, image_dir) 29 | gt_dir = os.path.join(root, gt_dir) 30 | # sem_key = key.format(task="sem_seg") 31 | DatasetCatalog.register( 32 | key, lambda x=image_dir, y=gt_dir: load_cityscapes_semantic(x, y) 33 | ) 34 | MetadataCatalog.get(key).set( 35 | image_dir=image_dir, 36 | gt_dir=gt_dir, 37 | evaluator_type="cityscapes_sem_seg", 38 | ignore_label=255, 39 | **meta, 40 | ) 41 | 42 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 43 | register_all_city_c(_root) 44 | -------------------------------------------------------------------------------- /hgformer/data/samplers/grouped_batch_sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import numpy as np 3 | from torch.utils.data.sampler import BatchSampler, Sampler 4 | 5 | 6 | class GroupedBatchSampler(BatchSampler): 7 | """ 8 | Wraps another sampler to yield a mini-batch of indices. 9 | It enforces that the batch only contain elements from the same group. 10 | It also tries to provide mini-batches which follows an ordering which is 11 | as close as possible to the ordering from the original sampler. 12 | """ 13 | 14 | def __init__(self, sampler, group_ids, batch_size): 15 | """ 16 | Args: 17 | sampler (Sampler): Base sampler. 18 | group_ids (list[int]): If the sampler produces indices in range [0, N), 19 | `group_ids` must be a list of `N` ints which contains the group id of each sample. 20 | The group ids must be a set of integers in the range [0, num_groups). 21 | batch_size (int): Size of mini-batch. 
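        Example (illustrative only; any torch Sampler yielding dataset indices works):

            >>> from torch.utils.data.sampler import SequentialSampler
            >>> group_ids = [0, 1, 0, 1]  # e.g. the aspect-ratio group of each image
            >>> for batch in GroupedBatchSampler(SequentialSampler(range(4)), group_ids, batch_size=2):
            ...     print(batch)
            [0, 2]
            [1, 3]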
22 | """ 23 | if not isinstance(sampler, Sampler): 24 | raise ValueError( 25 | "sampler should be an instance of " 26 | "torch.utils.data.Sampler, but got sampler={}".format(sampler) 27 | ) 28 | self.sampler = sampler 29 | self.group_ids = np.asarray(group_ids) 30 | assert self.group_ids.ndim == 1 31 | self.batch_size = batch_size 32 | groups = np.unique(self.group_ids).tolist() 33 | 34 | # buffer the indices of each group until batch size is reached 35 | self.buffer_per_group = {k: [] for k in groups} 36 | 37 | def __iter__(self): 38 | for idx in self.sampler: 39 | group_id = self.group_ids[idx] 40 | group_buffer = self.buffer_per_group[group_id] 41 | group_buffer.append(idx) 42 | if len(group_buffer) == self.batch_size: 43 | yield group_buffer[:] # yield a copy of the list 44 | del group_buffer[:] 45 | 46 | def __len__(self): 47 | raise NotImplementedError("len() of GroupedBatchSampler is not well-defined.") 48 | -------------------------------------------------------------------------------- /hgformer/modeling/pixel_decoder/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | 18 | #include "cpu/ms_deform_attn_cpu.h" 19 | 20 | #ifdef WITH_CUDA 21 | #include "cuda/ms_deform_attn_cuda.h" 22 | #endif 23 | 24 | 25 | at::Tensor 26 | ms_deform_attn_forward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const int im2col_step) 33 | { 34 | if (value.type().is_cuda()) 35 | { 36 | #ifdef WITH_CUDA 37 | return ms_deform_attn_cuda_forward( 38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector 47 | ms_deform_attn_backward( 48 | const at::Tensor &value, 49 | const at::Tensor &spatial_shapes, 50 | const at::Tensor &level_start_index, 51 | const at::Tensor &sampling_loc, 52 | const at::Tensor &attn_weight, 53 | const at::Tensor &grad_output, 54 | const int im2col_step) 55 | { 56 | if (value.type().is_cuda()) 57 | { 58 | #ifdef WITH_CUDA 59 | return ms_deform_attn_cuda_backward( 60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 61 | #else 62 | AT_ERROR("Not compiled with GPU support"); 63 | #endif 64 | } 65 | AT_ERROR("Not implemented on the CPU"); 66 | } 67 | 68 | -------------------------------------------------------------------------------- /hgformer/data/datasets/register_city_c_vis.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. 
and its affiliates. 2 | import os 3 | 4 | from detectron2.data import DatasetCatalog, MetadataCatalog 5 | from detectron2.data.datasets import load_sem_seg 6 | from detectron2.data.datasets.builtin_meta import _get_builtin_metadata 7 | # from .acdc import load_acdc_semantic 8 | from detectron2.data.datasets.cityscapes import load_cityscapes_semantic 9 | 10 | corruptions = ['clean', 'gaussian_noise', 'shot_noise', 'impulse_noise', 'defocus_blur', 11 | 'glass_blur', 'motion_blur', 'zoom_blur', 'snow', 'frost', 'fog', 12 | 'brightness', 'contrast', 'elastic_transform', 'pixelate', 'jpeg_compression', 13 | 'speckle_noise', 'gaussian_blur', 'spatter', 'saturate'] 14 | # ==== Predefined splits for raw cityscapes c images =========== 15 | _RAW_ACDC_SPLITS = { 16 | "city_c_gaussiannoise5_vis": ("gauss_noise/5/", "cityscapes/gtFine/val/"), 17 | "city_c_gaussiannoise4_vis": ("gauss_noise/4/", "cityscapes/gtFine/val/"), 18 | "city_c_gaussiannoise3_vis": ("gauss_noise/3/", "cityscapes/gtFine/val/"), 19 | "city_c_gaussiannoise2_vis": ("gauss_noise/2/", "cityscapes/gtFine/val/"), 20 | "city_c_gaussiannoise1_vis": ("gauss_noise/1/", "cityscapes/gtFine/val/"), 21 | "city_c_gaussiannoise0_vis": ("gauss_noise/0/", "cityscapes/gtFine/val/"), 22 | "city_c_tmp_gaussiannoise4_vis": ("city_c_tmp/gaussian_noise/4/", "cityscapes/gtFine/val/"), 23 | "city_c_tmp_clean_vis": ("city_c_tmp/clean/", "cityscapes/gtFine/val/"), 24 | 25 | } 26 | 27 | def register_all_city_c_vis(root): 28 | for key, (image_dir, gt_dir) in _RAW_ACDC_SPLITS.items(): 29 | meta = _get_builtin_metadata("cityscapes") 30 | image_dir = os.path.join(root, image_dir) 31 | gt_dir = os.path.join(root, gt_dir) 32 | # sem_key = key.format(task="sem_seg") 33 | DatasetCatalog.register( 34 | key, lambda x=image_dir, y=gt_dir: load_cityscapes_semantic(x, y) 35 | ) 36 | MetadataCatalog.get(key).set( 37 | image_dir=image_dir, 38 | gt_dir=gt_dir, 39 | evaluator_type="cityscapes_sem_seg", 40 | ignore_label=255, 41 | **meta, 42 | ) 43 | 44 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 45 | register_all_city_c_vis(_root) 46 | -------------------------------------------------------------------------------- /datasets/generate_cityscapes_c.py: -------------------------------------------------------------------------------- 1 | from imagecorruptions import corrupt 2 | from imagecorruptions import get_corruption_names 3 | import os 4 | import cv2 5 | from multiprocessing import Pool 6 | import numpy as np 7 | import random 8 | import mmcv 9 | 10 | random.seed(8) # for reproducibility 11 | np.random.seed(8) 12 | corruptions = ['gaussian_noise', 'shot_noise', 'impulse_noise', 'defocus_blur', 13 | 'glass_blur', 'motion_blur', 'zoom_blur', 'snow', 'frost', 'fog', 14 | 'brightness', 'contrast', 'elastic_transform', 'pixelate', 'jpeg_compression', 15 | 'speckle_noise', 'gaussian_blur', 'spatter', 'saturate'] 16 | 17 | img_dir = 'datasets/cityscapes-c/clean' 18 | num_imgs = 500 19 | img_names = [] 20 | prog_bar = mmcv.ProgressBar(num_imgs) 21 | img_dict = {} 22 | for img_path in mmcv.scandir(img_dir, suffix='png', recursive=True): 23 | img_name = os.path.join(img_dir, img_path) 24 | img = mmcv.imread(img_name) 25 | img_dict[img_name] = img 26 | prog_bar.update() 27 | 28 | def perturb(i, p, s): 29 | img = corrupt(i, corruption_name=p, severity=s) 30 | return img 31 | 32 | def worker(optuple): 33 | srcfile, p, s, perturbed_img_path = optuple 34 | img = img_dict[srcfile] 35 | perturbed_img = perturb(img, p, s) 36 | mmcv.imwrite(perturbed_img, perturbed_img_path, 
auto_mkdir=True) 37 | 38 | def convert_img_path(ori_path, suffix): 39 | new_path = ori_path.replace('clean', suffix) 40 | assert new_path != ori_path 41 | return new_path 42 | 43 | if __name__ == '__main__': 44 | 45 | pool = Pool(32) 46 | filelist = [] 47 | for p in corruptions: 48 | print("\n ### gen corruption:{} ###".format(p)) 49 | for img_path in mmcv.scandir(img_dir, suffix='png', recursive=True): 50 | srcfile = os.path.join(img_dir, img_path) 51 | for s in range(5): 52 | img_suffix = p + "/" + str(s+1) 53 | out_dir = img_dir.replace('clean', img_suffix) 54 | assert out_dir != img_dir 55 | if not os.path.exists(out_dir): 56 | os.makedirs(out_dir) 57 | perturbed_img_path = convert_img_path(srcfile, img_suffix) 58 | filelist.append((srcfile, p, s+1, perturbed_img_path)) 59 | # import ipdb; ipdb.set_trace() 60 | pool.map(worker, filelist) -------------------------------------------------------------------------------- /datasets/find_truncated_images.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import os 3 | import numpy as np 4 | 5 | # https://en.wikipedia.org/wiki/YUV#SDTV_with_BT.601 6 | _M_RGB2YUV = [[0.299, 0.587, 0.114], [-0.14713, -0.28886, 0.436], [0.615, -0.51499, -0.10001]] 7 | _M_YUV2RGB = [[1.0, 0.0, 1.13983], [1.0, -0.39465, -0.58060], [1.0, 2.03211, 0.0]] 8 | 9 | def GetFileFromThisRootDir(dir,ext = None): 10 | allfiles = [] 11 | needExtFilter = (ext != None) 12 | for root,dirs,files in os.walk(dir): 13 | for filespath in files: 14 | filepath = os.path.join(root, filespath) 15 | extension = os.path.splitext(filepath)[1][1:] 16 | if needExtFilter and extension in ext: 17 | allfiles.append(filepath) 18 | elif not needExtFilter: 19 | allfiles.append(filepath) 20 | return allfiles 21 | 22 | def convert_PIL_to_numpy(image, format): 23 | """ 24 | Convert PIL image to numpy array of target format. 
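    For example (illustrative), ``convert_PIL_to_numpy(Image.open(path), "BGR")``
    returns an HWC uint8 array with the channel order flipped from RGB to BGR.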
25 | 26 | Args: 27 | image (PIL.Image): a PIL image 28 | format (str): the format of output image 29 | 30 | Returns: 31 | (np.ndarray): also see `read_image` 32 | """ 33 | if format is not None: 34 | # PIL only supports RGB, so convert to RGB and flip channels over below 35 | conversion_format = format 36 | if format in ["BGR", "YUV-BT.601"]: 37 | conversion_format = "RGB" 38 | image = image.convert(conversion_format) 39 | image = np.asarray(image) 40 | # PIL squeezes out the channel dimension for "L", so make it HWC 41 | if format == "L": 42 | image = np.expand_dims(image, -1) 43 | 44 | # handle formats not supported by PIL 45 | elif format == "BGR": 46 | # flip channels if needed 47 | image = image[:, :, ::-1] 48 | elif format == "YUV-BT.601": 49 | image = image / 255.0 50 | image = np.dot(image, np.array(_M_RGB2YUV).T) 51 | 52 | return image 53 | 54 | filepath = "/BS/databases15/GTA/images/train" 55 | 56 | filenames = GetFileFromThisRootDir(filepath) 57 | count = 0 58 | for file in filenames: 59 | img = Image.open(file) 60 | print(f'filename: {file}') 61 | try: 62 | img_np = convert_PIL_to_numpy(img, format="RGB") 63 | except: 64 | # import ipdb; ipdb.set_trace() 65 | count = count + 1 66 | print(f"count: {count}") 67 | print(f"count: {count}") -------------------------------------------------------------------------------- /datasets/split_data/synthia/split_synthia.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool 2 | import shutil 3 | import os 4 | shutil._USE_CP_SENDFILE = False 5 | 6 | def worker(path_pair): 7 | srcpath, dstpath = path_pair 8 | # print(f'srcpath{srcpath}') 9 | # print(f'dstpath{dstpath}') 10 | # shutil.copyfile(srcpath, dstpath) 11 | shutil.move(srcpath, dstpath) 12 | 13 | if __name__ == '__main__': 14 | pool = Pool(32) 15 | image_path = r'datasets/synthia/RGB' 16 | label_path = r'datasets/synthia/GT/LABELS' 17 | 18 | # dst_image_path = r'datasets/synthia_split/RGB' 19 | # dst_label_path = r'datasets/synthia_split/GT' 20 | 21 | dst_image_path = image_path 22 | dst_label_path = label_path 23 | 24 | with open('datasets/split_data/synthia_split_train.txt', 'r') as f: 25 | train_list = f.readlines() 26 | train_list = [x.strip() for x in train_list] 27 | 28 | with open('datasets/split_data/synthia_split_val.txt', 'r') as f: 29 | val_list = f.readlines() 30 | val_list = [x.strip() for x in val_list] 31 | 32 | train_pairs = [] 33 | 34 | if not os.path.exists(os.path.join(dst_image_path, 'train')): 35 | os.makedirs(os.path.join(dst_image_path, 'train')) 36 | 37 | if not os.path.exists(os.path.join(dst_label_path, 'train')): 38 | os.makedirs(os.path.join(dst_label_path, 'train')) 39 | 40 | for file in train_list: 41 | srcfile = os.path.join(image_path, file) 42 | dstfile = os.path.join(dst_image_path, 'train', file) 43 | train_pairs.append((srcfile, dstfile)) 44 | 45 | srclabel = os.path.join(label_path, file) 46 | dstlabel = os.path.join(dst_label_path, 'train', file) 47 | train_pairs.append((srclabel, dstlabel)) 48 | pool.map(worker, train_pairs) 49 | 50 | val_pairs = [] 51 | 52 | if not os.path.exists(os.path.join(dst_image_path, 'val')): 53 | os.makedirs(os.path.join(dst_image_path, 'val')) 54 | 55 | if not os.path.exists(os.path.join(dst_label_path, 'val')): 56 | os.makedirs(os.path.join(dst_label_path, 'val')) 57 | 58 | for file in val_list: 59 | srcfile = os.path.join(image_path, file) 60 | dstfile = os.path.join(dst_image_path, 'val', file) 61 | val_pairs.append((srcfile, dstfile)) 62 | 63 | 
srclabel = os.path.join(label_path, file) 64 | dstlabel = os.path.join(dst_label_path, 'val', file) 65 | val_pairs.append((srclabel, dstlabel)) 66 | pool.map(worker, val_pairs) 67 | -------------------------------------------------------------------------------- /configs/cityscapes/hgformer_R50_bs16_20k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "GroupFormer" 4 | RESNETS: 5 | NORM: "GN" 6 | SEM_SEG_HEAD: 7 | NAME: "MaskFormerHead" 8 | IGNORE_VALUE: 255 9 | NUM_CLASSES: 19 10 | LOSS_WEIGHT: 1.0 11 | CONVS_DIM: 256 12 | MASK_DIM: 256 13 | NORM: "GN" 14 | # pixel decoder 15 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 16 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 17 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 18 | COMMON_STRIDE: 4 19 | TRANSFORMER_ENC_LAYERS: 6 20 | NUM_GROUP_TOKENS: [512, 32] 21 | NUM_OUTPUT_GROUPS: [512, 32] 22 | # DOWNSAMPLE_RATE: 16 # 0.31 23 | # DOWNSAMPLE_RATE: 8 # 24 | DOWNSAMPLE_RATE: 4 # 0.32s 25 | 26 | # SPIX_RES: [16, 16] 27 | MASK_FORMER: 28 | # TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 29 | TRANSFORMER_DECODER_NAME: "GroupFormerDecoder" 30 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 31 | DEEP_SUPERVISION: True 32 | # DEEP_MASK_SUPERVISION: False 33 | NO_OBJECT_WEIGHT: 0.1 34 | STAGE_WEIGHTS: [1.0] 35 | CLASS_WEIGHT: 2.0 36 | MASK_WEIGHT: 5.0 37 | DICE_WEIGHT: 5.0 38 | SPIX_MASK_WEIGHT: 0.0 39 | SPIX_CLASS_WEIGHT: 2.0 40 | CONTRASTIVE_LOSS: True 41 | CONTRASTIVE_WEIGH: 6.0 42 | CONTRASTIVE_TAU: 0.1 43 | HIDDEN_DIM: 256 44 | NUM_OBJECT_QUERIES: 100 45 | NHEADS: 8 46 | DROPOUT: 0.0 47 | DIM_FEEDFORWARD: 2048 48 | ENC_LAYERS: 0 49 | PRE_NORM: False 50 | ENFORCE_INPUT_PROJ: False 51 | SIZE_DIVISIBILITY: 32 52 | DEC_LAYERS: 6 # 9 decoder layers, add one for the loss on learnable query 53 | SPIX_SELF_ATTEN_LAYERS: 6 54 | TRAIN_NUM_POINTS: 12544 55 | OVERSAMPLE_RATIO: 3.0 56 | IMPORTANCE_SAMPLE_RATIO: 0.75 57 | TEST: 58 | SEMANTIC_ON: True 59 | INSTANCE_ON: False 60 | PANOPTIC_ON: False 61 | OVERLAP_THRESHOLD: 0.8 62 | OBJECT_MASK_THRESHOLD: 0.8 63 | SOLVER: 64 | IMS_PER_BATCH: 16 65 | BASE_LR: 0.0001 66 | MAX_ITER: 20000 67 | WARMUP_FACTOR: 1.0 68 | WARMUP_ITERS: 0 69 | WEIGHT_DECAY: 0.05 70 | OPTIMIZER: "ADAMW" 71 | LR_SCHEDULER_NAME: "WarmupPolyLR" 72 | BACKBONE_MULTIPLIER: 0.1 73 | CLIP_GRADIENTS: 74 | ENABLED: True 75 | CLIP_TYPE: "full_model" 76 | CLIP_VALUE: 0.01 77 | NORM_TYPE: 2.0 78 | AMP: 79 | ENABLED: False 80 | 81 | DATALOADER: 82 | FILTER_EMPTY_ANNOTATIONS: True 83 | NUM_WORKERS: 4 84 | VERSION: 2 85 | 86 | CUDNN_BENCHMARK: True 87 | TEST: 88 | CLUSTER_SOFTMAX: True 89 | PRED_STAGE: "spix_pixelexclude0125+stage3" -------------------------------------------------------------------------------- /GETTING_STARTED.md: -------------------------------------------------------------------------------- 1 | # Getting Started with HGFormer 2 | 3 | This document provides a brief intro of the usage of HGFormer. 4 | 5 | Please see [Getting Started with Detectron2](https://github.com/facebookresearch/detectron2/blob/master/GETTING_STARTED.md) for full usage. 6 | 7 | ## Evaluation with Pre-trained Models 8 | 9 | Download [models](https://drive.google.com/drive/folders/1fUWaIhXtSxHLdTFxnuOSldLUe_ferauh?usp=drive_link). 
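The evaluation datasets referenced by the configs (e.g. ```mapillary_val```, ```bdd_val```, ```gta_trainid_val```, ```synthia_val``` and the Cityscapes-C splits) are registered at import time by the modules in ```hgformer/data/datasets```, and their folders are resolved through the ```DETECTRON2_DATASETS``` environment variable (default: ```datasets```). The snippet below is only an illustrative sanity check; it assumes HGFormer is installed as described in [INSTALL.md](./INSTALL.md) so that importing ```hgformer.data.datasets``` succeeds in your environment:
```
import os
os.environ.setdefault("DETECTRON2_DATASETS", "/path/to/datasets")  # set before importing

from detectron2.data import DatasetCatalog
import hgformer.data.datasets  # noqa: F401  importing triggers the registration

print(sorted(n for n in DatasetCatalog.list() if "mapillary" in n or "bdd" in n))
```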
10 | 11 | ### Cityscapes -> ACDC 12 | 13 | ``` 14 | python demo/inference.py --config-file configs/cityscapes/hgformer_swin_tiny_bs16_20k.yaml \ 15 | --input datasets/acdc/rgb_anon/all/test --output path_to_output \ 16 | --opts MODEL.WEIGHTS path_to_checkpoint 17 | ``` 18 | After running the command, you will find the results in ```path_to_output```. Then you can follow the instructions on the [ACDC evaluation server](https://acdc.vision.ee.ethz.ch/login?target=%2Fsubmit) to get your scores. 19 | To evaluate on a specific condition only, replace ```all``` with one of ```fog```, ```snow```, ```night``` or ```rain```. 20 | 21 | ### Cityscapes -> Cityscapes-C 22 | 23 | ``` 24 | python test_city_c_level5.py --num-gpus 8 --config-file configs/city_c/hgformer_swin_tiny_bs16_20k.yaml \ 25 | --eval-only MODEL.WEIGHTS path_to_checkpoint OUTPUT_DIR path_to_output 26 | ``` 27 | 28 | ### Cityscapes -> Others 29 | 30 | ``` 31 | python plain_train_net.py --num-gpus 8 --config-file configs/cityscapes/hgformer_swin_tiny_bs16_20k.yaml \ 32 | --eval-only MODEL.WEIGHTS path_to_checkpoint OUTPUT_DIR path_to_output 33 | ``` 34 | 35 | ### Mapillary -> Others 36 | 37 | ``` 38 | python plain_train_net.py --num-gpus 8 --config-file configs/mapillary/hgformer_swin_tiny_bs16_20k_mapillary.yaml \ 39 | --eval-only MODEL.WEIGHTS path_to_checkpoint OUTPUT_DIR path_to_output 40 | ``` 41 | 42 | ## Training in Command Line 43 | 44 | 45 | To train a model, first 46 | set up the corresponding datasets following 47 | [datasets/README.md](./datasets/README.md), then prepare the models pre-trained on ImageNet classification following [tools/README.md](./tools/README.md). Finally, run: 48 | ``` 49 | python plain_train_net.py --num-gpus 8 \ 50 | --config-file configs/cityscapes/hgformer_swin_tiny_bs16_20k.yaml OUTPUT_DIR path_to_output 51 | ``` 52 | 53 | The configs are made for 8-GPU training. 54 | Since we use the AdamW optimizer, there is no established rule for scaling the learning rate with the batch size. 
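If you do change the batch size, a common (but not validated for HGFormer) starting point is the linear-scaling heuristic: for example, training on a single GPU with ```SOLVER.IMS_PER_BATCH 2``` (the same 2 images per GPU as the default 16-image, 8-GPU setup) would correspond to ```SOLVER.BASE_LR 0.0000125``` (0.0001 * 2 / 16); treat this only as an initial guess and tune from there.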
55 | To train on 1 GPU, you need to figure out learning rate and batch size by yourself: 56 | ``` 57 | python plain_train_net.py \ 58 | --config-file configs/cityscapes/hgformer_swin_tiny_bs16_20k.yaml \ 59 | --num-gpus 1 SOLVER.IMS_PER_BATCH SET_TO_SOME_REASONABLE_VALUE SOLVER.BASE_LR SET_TO_SOME_REASONABLE_VALUE 60 | ``` 61 | -------------------------------------------------------------------------------- /configs/mapillary/hgformer_R50_bs16_20k_mapillary.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-mapillary19-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "GroupFormer" 4 | RESNETS: 5 | NORM: "GN" 6 | SEM_SEG_HEAD: 7 | NAME: "MaskFormerHead" 8 | IGNORE_VALUE: 255 9 | NUM_CLASSES: 19 10 | LOSS_WEIGHT: 1.0 11 | CONVS_DIM: 256 12 | MASK_DIM: 256 13 | NORM: "GN" 14 | # pixel decoder 15 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 16 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 17 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 18 | COMMON_STRIDE: 4 19 | TRANSFORMER_ENC_LAYERS: 6 20 | NUM_GROUP_TOKENS: [512, 32] 21 | NUM_OUTPUT_GROUPS: [512, 32] 22 | # DOWNSAMPLE_RATE: 16 # 0.31 23 | # DOWNSAMPLE_RATE: 8 # mapillary: (16, 22) 24 | DOWNSAMPLE_RATE: 4 # 0.32s mapillary: (32, 44), cityscapes: (32, 64) 25 | 26 | # SPIX_RES: [16, 16] 27 | MASK_FORMER: 28 | # TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 29 | TRANSFORMER_DECODER_NAME: "GroupFormerDecoder" 30 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 31 | DEEP_SUPERVISION: True 32 | # DEEP_MASK_SUPERVISION: False 33 | NO_OBJECT_WEIGHT: 0.1 34 | STAGE_WEIGHTS: [1.0] 35 | CLASS_WEIGHT: 2.0 36 | MASK_WEIGHT: 5.0 37 | DICE_WEIGHT: 5.0 38 | SPIX_MASK_WEIGHT: 0.0 39 | SPIX_CLASS_WEIGHT: 2.0 40 | CONTRASTIVE_LOSS: True 41 | CONTRASTIVE_WEIGH: 6.0 42 | CONTRASTIVE_TAU: 0.1 43 | HIDDEN_DIM: 256 44 | NUM_OBJECT_QUERIES: 100 45 | NHEADS: 8 46 | DROPOUT: 0.0 47 | DIM_FEEDFORWARD: 2048 48 | ENC_LAYERS: 0 49 | PRE_NORM: False 50 | ENFORCE_INPUT_PROJ: False 51 | SIZE_DIVISIBILITY: 32 52 | # SIZE_DIVISIBILITY: 64 53 | DEC_LAYERS: 6 # 9 decoder layers, add one for the loss on learnable query 54 | SPIX_SELF_ATTEN_LAYERS: 6 55 | TRAIN_NUM_POINTS: 12544 56 | OVERSAMPLE_RATIO: 3.0 57 | IMPORTANCE_SAMPLE_RATIO: 0.75 58 | TEST: 59 | SEMANTIC_ON: True 60 | INSTANCE_ON: False 61 | PANOPTIC_ON: False 62 | OVERLAP_THRESHOLD: 0.8 63 | OBJECT_MASK_THRESHOLD: 0.8 64 | SOLVER: 65 | IMS_PER_BATCH: 16 66 | BASE_LR: 0.0001 67 | MAX_ITER: 20000 68 | WARMUP_FACTOR: 1.0 69 | WARMUP_ITERS: 0 70 | WEIGHT_DECAY: 0.05 71 | OPTIMIZER: "ADAMW" 72 | LR_SCHEDULER_NAME: "WarmupPolyLR" 73 | BACKBONE_MULTIPLIER: 0.1 74 | CLIP_GRADIENTS: 75 | ENABLED: True 76 | CLIP_TYPE: "full_model" 77 | CLIP_VALUE: 0.01 78 | NORM_TYPE: 2.0 79 | AMP: 80 | ENABLED: False 81 | DATALOADER: 82 | FILTER_EMPTY_ANNOTATIONS: True 83 | NUM_WORKERS: 4 84 | VERSION: 2 85 | 86 | CUDNN_BENCHMARK: True 87 | 88 | TEST: 89 | CLUSTER_SOFTMAX: True 90 | PRED_STAGE: "spix_pixelexclude0125+stage3" -------------------------------------------------------------------------------- /datasets/split_data/gta/split_gta.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool 2 | import shutil 3 | import os 4 | shutil._USE_CP_SENDFILE = False 5 | def worker(path_pair): 6 | srcpath, dstpath = path_pair 7 | # shutil.copyfile(srcpath, dstpath) 8 | shutil.move(srcpath, dstpath) 9 | 10 | if __name__ == '__main__': 11 | pool = Pool(32) 12 | image_path = 
r'datasets/GTA/images' 13 | label_path = r'datasets/GTA/labels' 14 | 15 | with open('datasets/split_data/gtav_split_train.txt', 'r') as f: 16 | train_list = f.readlines() 17 | train_list = [x.strip() for x in train_list] 18 | 19 | with open('datasets/split_data/gtav_split_val.txt', 'r') as f: 20 | val_list = f.readlines() 21 | val_list = [x.strip() for x in val_list] 22 | 23 | with open('datasets/split_data/gtav_split_test.txt', 'r') as f: 24 | test_list = f.readlines() 25 | test_list = [x.strip() for x in test_list] 26 | 27 | train_pairs = [] 28 | 29 | if not os.path.exists(os.path.join(image_path, 'train')): 30 | os.makedirs(os.path.join(image_path, 'train')) 31 | 32 | if not os.path.exists(os.path.join(label_path, 'train')): 33 | os.makedirs(os.path.join(label_path, 'train')) 34 | 35 | for file in train_list: 36 | srcfile = os.path.join(image_path, file) 37 | dstfile = os.path.join(image_path, 'train', file) 38 | train_pairs.append((srcfile, dstfile)) 39 | 40 | srclabel = os.path.join(label_path, file) 41 | dstlabel = os.path.join(label_path, 'train', file) 42 | train_pairs.append((srclabel, dstlabel)) 43 | pool.map(worker, train_pairs) 44 | 45 | val_pairs = [] 46 | 47 | if not os.path.exists(os.path.join(image_path, 'valid')): 48 | os.makedirs(os.path.join(image_path, 'valid')) 49 | 50 | if not os.path.exists(os.path.join(label_path, 'valid')): 51 | os.makedirs(os.path.join(label_path, 'valid')) 52 | 53 | for file in val_list: 54 | srcfile = os.path.join(image_path, file) 55 | dstfile = os.path.join(image_path, 'valid', file) 56 | val_pairs.append((srcfile, dstfile)) 57 | 58 | srclabel = os.path.join(label_path, file) 59 | dstlabel = os.path.join(label_path, 'valid', file) 60 | val_pairs.append((srclabel, dstlabel)) 61 | pool.map(worker, val_pairs) 62 | 63 | test_pairs = [] 64 | 65 | if not os.path.exists(os.path.join(image_path, 'test')): 66 | os.makedirs(os.path.join(image_path, 'test')) 67 | 68 | if not os.path.exists(os.path.join(label_path, 'test')): 69 | os.makedirs(os.path.join(label_path, 'test')) 70 | 71 | for file in test_list: 72 | srcfile = os.path.join(image_path, file) 73 | dstfile = os.path.join(image_path, 'test', file) 74 | test_pairs.append((srcfile, dstfile)) 75 | 76 | srclabel = os.path.join(label_path, file) 77 | dstlabel = os.path.join(label_path, 'test', file) 78 | test_pairs.append((srclabel, dstlabel)) 79 | pool.map(worker, test_pairs) -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | This directory contains few tools for HGFormer. 2 | 3 | * `convert-torchvision-to-d2.py` 4 | 5 | Tool to convert torchvision pre-trained weights for D2. 6 | 7 | ``` 8 | wget https://download.pytorch.org/models/resnet101-63fe2227.pth 9 | python tools/convert-torchvision-to-d2.py resnet101-63fe2227.pth R-101.pkl 10 | ``` 11 | 12 | * `convert-pretrained-swin-model-to-d2.py` 13 | 14 | Tool to convert Swin Transformer pre-trained weights for D2. 
15 | 16 | ``` 17 | pip install timm 18 | 19 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 20 | python tools/convert-pretrained-swin-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 21 | 22 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth 23 | python tools/convert-pretrained-swin-model-to-d2.py swin_small_patch4_window7_224.pth swin_small_patch4_window7_224.pkl 24 | 25 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth 26 | python tools/convert-pretrained-swin-model-to-d2.py swin_base_patch4_window12_384_22k.pth swin_base_patch4_window12_384_22k.pkl 27 | 28 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth 29 | python tools/convert-pretrained-swin-model-to-d2.py swin_large_patch4_window12_384_22k.pth swin_large_patch4_window12_384_22k.pkl 30 | ``` 31 | 32 | * `evaluate_pq_for_semantic_segmentation.py` 33 | 34 | Tool to evaluate PQ (PQ-stuff) for semantic segmentation predictions. 35 | 36 | Usage: 37 | 38 | ``` 39 | python tools/evaluate_pq_for_semantic_segmentation.py --dataset-name ade20k_sem_seg_val --json-file OUTPUT_DIR/inference/sem_seg_predictions.json 40 | ``` 41 | 42 | where `OUTPUT_DIR` is set in the config file. 43 | 44 | * `evaluate_coco_boundary_ap.py` 45 | 46 | Tool to evaluate Boundary AP for instance segmentation predictions. 47 | 48 | Usage: 49 | 50 | ``` 51 | python tools/coco_instance_evaluation.py --gt-json-file COCO_GT_JSON --dt-json-file COCO_DT_JSON 52 | ``` 53 | 54 | To install Boundary IoU API, run: 55 | 56 | ``` 57 | pip install git+https://github.com/bowenc0221/boundary-iou-api.git 58 | ``` 59 | 60 | * `analyze_model.py` 61 | 62 | Tool to analyze model parameters and flops. 63 | 64 | Usage for semantic segmentation (ADE20K only, use with caution!): 65 | 66 | ``` 67 | python tools/analyze_model.py --num-inputs 1 --tasks flop --use-fixed-input-size --config-file CONFIG_FILE 68 | ``` 69 | 70 | Note that, for semantic segmentation (ADE20K only), we use a dummy image with fixed size that equals to `cfg.INPUT.CROP.SIZE[0] x cfg.INPUT.CROP.SIZE[0]`. 71 | Please do not use `--use-fixed-input-size` for calculating FLOPs on other datasets like Cityscapes! 72 | 73 | Usage for panoptic and instance segmentation: 74 | 75 | ``` 76 | python tools/analyze_model.py --num-inputs 100 --tasks flop --config-file CONFIG_FILE 77 | ``` 78 | 79 | Note that, for panoptic and instance segmentation, we compute the average flops over 100 real validation images. 80 | -------------------------------------------------------------------------------- /hgformer/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 
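    Illustrative usage (shapes only)::

        pe = PositionEmbeddingSine(num_pos_feats=128, normalize=True)
        x = torch.zeros(2, 256, 16, 32)   # [B, C, H, W] feature map; only B, H, W are used
        pos = pe(x)                       # [2, 256, 16, 32], i.e. 2 * num_pos_feats channels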
16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | if mask is None: 31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 32 | # mask: e.g. shape [2, 16, 32], [B, H, W] 33 | not_mask = ~mask 34 | y_embed = not_mask.cumsum(1, dtype=torch.float32) # [B, H, W] 35 | x_embed = not_mask.cumsum(2, dtype=torch.float32) # [B, H, W] 36 | if self.normalize: 37 | eps = 1e-6 38 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale # normalize the coordinates, then multiply 2pi 39 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 40 | 41 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) # [128] 42 | # import ipdb; ipdb.set_trace() 43 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 44 | 45 | pos_x = x_embed[:, :, :, None] / dim_t # [B, H, W, num_pos_feats] 46 | pos_y = y_embed[:, :, :, None] / dim_t 47 | pos_x = torch.stack( 48 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 49 | ).flatten(3) # [B, H, W, num_pos_feats] 50 | # import ipdb; ipdb.set_trace() 51 | pos_y = torch.stack( 52 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 53 | ).flatten(3) 54 | 55 | # import ipdb; ipdb.set_trace() 56 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) # [B, 2*num_pos_feats, H, W], 2 * num_pos_feats is equal to the number of feat channels 57 | return pos 58 | 59 | def __repr__(self, _repr_indent=4): 60 | head = "Positional encoding " + self.__class__.__name__ 61 | body = [ 62 | "num_pos_feats: {}".format(self.num_pos_feats), 63 | "temperature: {}".format(self.temperature), 64 | "normalize: {}".format(self.normalize), 65 | "scale: {}".format(self.scale), 66 | ] 67 | # _repr_indent = 4 68 | lines = [head] + [" " * _repr_indent + line for line in body] 69 | return "\n".join(lines) 70 | -------------------------------------------------------------------------------- /MODEL_ZOO.md: -------------------------------------------------------------------------------- 1 | # HGFormer Model Zoo and Baselines 2 | 3 | #### Detectron2 ImageNet Pretrained Models 4 | 5 | It's common to initialize from backbone models pre-trained on ImageNet classification tasks. 6 | 7 | To prepare the backbones pre-trained on ImageNet classification, please following [tools/README.md](./tools/README.md) 8 | 9 | #### License 10 | 11 | All models available for download through this document are licensed under the 12 | [Creative Commons Attribution-NonCommercial 4.0 International License](https://creativecommons.org/licenses/by-nc/4.0/). 
13 | 14 | ## Cityscapes -> ACDC 15 | | Method | Backbone | Fog | Night | Rain | Snow | All | Download | 16 | |:-----------:|:---------:|:-----:|:-----:|:-----:|:-----:|:-----:|:--------:| 17 | | Mask2former | Swin-Tiny | 54.06 | 38.11 | 59.54 | 55.76 | 53.65 | [model](https://drive.google.com/drive/folders/1eL38sFGdUNV8o9EbFsheurHjdm-CNg5K?usp=sharing) | 18 | | HGFormer | Swin-Tiny | 59.82 | 41.88 | 60.92 | 60.82 | 56.95 | [model](https://drive.google.com/drive/folders/1Rq1PnaYTFACpZX_-oXTq7laCa0zwbfFR?usp=drive_link) | 19 | 20 | 21 | ## Cityscapes -> Cityscapes-C (level 5) 22 | | Method | Backbone | Average | Motion | Defoc | Glass | Gauss | Gauss | Impul | Shot | Speck | Bright | Contr | Satur | JPEG | Snow | Spatt | Fog | Frost | Download | 23 | |:-----------:|:-----------:|:---------:|:------:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:------:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:--------:| 24 | | Mask2former | Swin-Tiny | 41.68 | 51.61 | 51.52 | 39.69 | 46.71 | 6.89 | 7.68 | 12.75 | 44.10 | 72.71 | 58.60 | 69.14 | 22.86 | 26.10 | 58.35 | 67.12 | 31.11 | [model](https://drive.google.com/drive/folders/1eL38sFGdUNV8o9EbFsheurHjdm-CNg5K?usp=sharing) | 25 | | HGFormer | Swin-Tiny | 43.81 | 52.51 | 53.03 | 39.02 | 47.93 | 16.45 | 16.03 | 20.55 | 48.44 | 74.51 | 57.14 | 70.53 | 27.32 | 25.66 | 59.19 | 66.49 | 26.11 | [model](https://drive.google.com/drive/folders/1Rq1PnaYTFACpZX_-oXTq7laCa0zwbfFR?usp=drive_link) | 26 | ## Cityscapes -> Others 27 | | Method | Backbone | Mapillary | BDD | GTA | Synthia | Average | Download | 28 | |:-----------:|:---------:|:---------:|:-----:|:-----:|:-------:|:--------:|:--------:| 29 | | Mask2former | Swin-Tiny | 65.28 | 49.87 | 51.38 | 34.76 | 50.32 | [model](https://drive.google.com/drive/folders/1eL38sFGdUNV8o9EbFsheurHjdm-CNg5K?usp=sharing) | 30 | | HGFormer | Swin-Tiny | 67.22 | 52.69 | 51.94 | 32.98 | 51.21 |[model](https://drive.google.com/drive/folders/1Rq1PnaYTFACpZX_-oXTq7laCa0zwbfFR?usp=drive_link) | 31 | ## Mapillary -> Others 32 | 33 | | Method | Backbone | GTA | Synthia | Cityscapes | BDD | Average | Download | 34 | |:-----------:|:---------:|:-----:|:-------:|:----------:|:-----:|:-------:|:--------:| 35 | | Mask2former | Swin-Tiny | 57.81 | 40.14 | 68.23 | 59.05 | 56.31 | [model](https://drive.google.com/drive/folders/1xqvAcQZs2NZhUD5dG2KGPmYBnlkH4u-s?usp=drive_link) | 36 | | HGFormer | Swin-Tiny | 60.79 | 39.15 | 69.28 | 62.22 | 57.86 | [model](https://drive.google.com/drive/folders/1XJgHBKT7J-_Gzqgzo3EiX0wAnjXMNCGG?usp=drive_link) | 37 | 38 | ## Disclaimer 39 | The numbers differ slightly from the results reported in the paper because we presented an average of three runs in the paper. -------------------------------------------------------------------------------- /hgformer/modeling/pixel_decoder/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | import os 13 | import glob 14 | 15 | import torch 16 | 17 | from torch.utils.cpp_extension import CUDA_HOME 18 | from torch.utils.cpp_extension import CppExtension 19 | from torch.utils.cpp_extension import CUDAExtension 20 | 21 | from setuptools import find_packages 22 | from setuptools import setup 23 | 24 | requirements = ["torch", "torchvision"] 25 | 26 | def get_extensions(): 27 | this_dir = os.path.dirname(os.path.abspath(__file__)) 28 | extensions_dir = os.path.join(this_dir, "src") 29 | 30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 33 | 34 | sources = main_file + source_cpu 35 | extension = CppExtension 36 | extra_compile_args = {"cxx": []} 37 | define_macros = [] 38 | 39 | # Force cuda since torch ask for a device, not if cuda is in fact available. 40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: 41 | extension = CUDAExtension 42 | sources += source_cuda 43 | define_macros += [("WITH_CUDA", None)] 44 | extra_compile_args["nvcc"] = [ 45 | "-DCUDA_HAS_FP16=1", 46 | "-D__CUDA_NO_HALF_OPERATORS__", 47 | "-D__CUDA_NO_HALF_CONVERSIONS__", 48 | "-D__CUDA_NO_HALF2_OPERATORS__", 49 | ] 50 | else: 51 | if CUDA_HOME is None: 52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') 53 | else: 54 | raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') 55 | 56 | sources = [os.path.join(extensions_dir, s) for s in sources] 57 | include_dirs = [extensions_dir] 58 | ext_modules = [ 59 | extension( 60 | "MultiScaleDeformableAttention", 61 | sources, 62 | include_dirs=include_dirs, 63 | define_macros=define_macros, 64 | extra_compile_args=extra_compile_args, 65 | ) 66 | ] 67 | return ext_modules 68 | 69 | setup( 70 | name="MultiScaleDeformableAttention", 71 | version="1.0", 72 | author="Weijie Su", 73 | url="https://github.com/fundamentalvision/Deformable-DETR", 74 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 75 | packages=find_packages(exclude=("configs", "tests",)), 76 | ext_modules=get_extensions(), 77 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 78 | ) 79 | -------------------------------------------------------------------------------- /datasets/prepare_mapillary_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 
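# Remaps the Mapillary Vistas label IDs onto the 19 Cityscapes train IDs (all
# remaining classes become the ignore label 255) and writes the converted masks
# to "$DETECTRON2_DATASETS/mapillary/labels_detectron2/{training,validation}".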
4 | import os 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import tqdm 9 | from PIL import Image 10 | from multiprocessing import Pool 11 | 12 | ignore_label = 255 13 | 14 | id_to_ignore_or_group = {} 15 | 16 | # def gen_id_to_ignore(): 17 | # global id_to_ignore_or_group 18 | for i in range(66): 19 | id_to_ignore_or_group[i] = ignore_label 20 | 21 | ### Convert each class to a corresponding cityscapes class 22 | ### Road 23 | # Road 24 | id_to_ignore_or_group[13] = 0 25 | # Lane Marking - General 26 | id_to_ignore_or_group[24] = 0 27 | # Manhole 28 | id_to_ignore_or_group[41] = 0 29 | 30 | ### Sidewalk 31 | # Curb 32 | id_to_ignore_or_group[2] = 1 33 | # Sidewalk 34 | id_to_ignore_or_group[15] = 1 35 | 36 | ### Building 37 | # Building 38 | id_to_ignore_or_group[17] = 2 39 | 40 | ### Wall 41 | # Wall 42 | id_to_ignore_or_group[6] = 3 43 | 44 | ### Fence 45 | # Fence 46 | id_to_ignore_or_group[3] = 4 47 | 48 | ### Pole 49 | # Pole 50 | id_to_ignore_or_group[45] = 5 51 | # Utility Pole 52 | id_to_ignore_or_group[47] = 5 53 | 54 | ### Traffic Light 55 | # Traffic Light 56 | id_to_ignore_or_group[48] = 6 57 | 58 | ### Traffic Sign 59 | # Traffic Sign 60 | id_to_ignore_or_group[50] = 7 61 | 62 | ### Vegetation 63 | # Vegitation 64 | id_to_ignore_or_group[30] = 8 65 | 66 | ### Terrain 67 | # Terrain 68 | id_to_ignore_or_group[29] = 9 69 | 70 | ### Sky 71 | # Sky 72 | id_to_ignore_or_group[27] = 10 73 | 74 | ### Person 75 | # Person 76 | id_to_ignore_or_group[19] = 11 77 | 78 | ### Rider 79 | # Bicyclist 80 | id_to_ignore_or_group[20] = 12 81 | # Motorcyclist 82 | id_to_ignore_or_group[21] = 12 83 | # Other Rider 84 | id_to_ignore_or_group[22] = 12 85 | 86 | ### Car 87 | # Car 88 | id_to_ignore_or_group[55] = 13 89 | 90 | ### Truck 91 | # Truck 92 | id_to_ignore_or_group[61] = 14 93 | 94 | ### Bus 95 | # Bus 96 | id_to_ignore_or_group[54] = 15 97 | 98 | ### Train 99 | # On Rails 100 | id_to_ignore_or_group[58] = 16 101 | 102 | ### Motorcycle 103 | # Motorcycle 104 | id_to_ignore_or_group[57] = 17 105 | 106 | ### Bicycle 107 | # Bicycle 108 | id_to_ignore_or_group[52] = 18 109 | 110 | 111 | 112 | def convert(filetuple): 113 | input, outputpath = filetuple 114 | lab = np.asarray(Image.open(input)) 115 | assert lab.dtype == np.uint8 116 | output = np.zeros_like(lab, dtype=np.uint8) + 255 117 | for obj_id in np.unique(lab): 118 | # print(f'obj_id{obj_id}') 119 | # print(f'{id_to_ignore_or_group}') 120 | if obj_id in id_to_ignore_or_group: 121 | output[lab == obj_id] = id_to_ignore_or_group[obj_id] 122 | 123 | Image.fromarray(output).save(outputpath) 124 | 125 | if __name__ == "__main__": 126 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "mapillary" 127 | pool = Pool(32) 128 | # gen_id_to_ignore() 129 | # import ipdb; ipdb.set_trace() 130 | for name in ["training", "validation"]: 131 | annotation_dir = dataset_dir / name / "labels" 132 | output_dir = dataset_dir / "labels_detectron2" / name 133 | output_dir.mkdir(parents=True, exist_ok=True) 134 | filelist = [] 135 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 136 | output_file = output_dir / file.name 137 | # convert(file, output_file) 138 | filelist.append((file, output_file)) 139 | pool.map(convert, filelist) -------------------------------------------------------------------------------- /datasets/prepare_synthia_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. 
and its affiliates. 4 | import os 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import tqdm 9 | from PIL import Image 10 | from multiprocessing import Pool 11 | import cv2 12 | import imageio 13 | import imageio.v2 as imageio 14 | ignore_label = 255 15 | 16 | # mapping based on README.txt from SYNTHIA_RAND_CITYSCAPES 17 | trainid_to_trainid = { 18 | 0: ignore_label, # void 19 | 1: 10, # sky 20 | 2: 2, # building 21 | 3: 0, # road 22 | 4: 1, # sidewalk 23 | 5: 4, # fence 24 | 6: 8, # vegetation 25 | 7: 5, # pole 26 | 8: 13, # car 27 | 9: 7, # traffic sign 28 | 10: 11, # pedestrian - person 29 | 11: 18, # bicycle 30 | 12: 17, # motorcycle 31 | 13: ignore_label, # parking-slot 32 | 14: ignore_label, # road-work 33 | 15: 6, # traffic light 34 | 16: 9, # terrain 35 | 17: 12, # rider 36 | 18: 14, # truck 37 | 19: 15, # bus 38 | 20: 16, # train 39 | 21: 3, # wall 40 | 22: ignore_label # Lanemarking 41 | } 42 | 43 | # def convert(filetupe): 44 | # input, outputpath = filetupe 45 | # # lab = np.asarray(Image.open(input)) 46 | # # lab = imageio.imread(input, format='PNG-FI') 47 | # lab = imageio.imread(input, format='PNG') 48 | # 49 | # # print(input) 50 | # # lab = cv2.imread(str(input), cv2.IMREAD_UNCHANGED)[:, :, -1] 51 | # lab = np.array(lab, dtype=np.uint8)[:, :, 0] 52 | # assert lab.dtype == np.uint8 53 | # output = np.zeros_like(lab, dtype=np.uint8) + 255 54 | # for obj_id in np.unique(lab): 55 | # if obj_id in trainid_to_trainid: 56 | # output[lab == obj_id] = trainid_to_trainid[obj_id] 57 | # 58 | # Image.fromarray(output).save(outputpath) 59 | 60 | 61 | def convert(filetupe): 62 | file, new_file = filetupe 63 | # re-assign labels to match the format of Cityscapes 64 | # PIL does not work with the image format, but cv2 does 65 | label = cv2.imread(str(file), cv2.IMREAD_UNCHANGED)[:, :, -1] 66 | 67 | label_copy = 255 * np.ones(label.shape, dtype=np.uint8) 68 | sample_class_stats = {} 69 | for k, v in trainid_to_trainid.items(): 70 | k_mask = label == k 71 | label_copy[k_mask] = v 72 | n = int(np.sum(k_mask)) 73 | if n > 0: 74 | sample_class_stats[v] = n 75 | # new_file = file.replace('.png', '_labelTrainIds.png') 76 | # assert file != new_file 77 | # sample_class_stats['file'] = new_file 78 | Image.fromarray(label_copy, mode='L').save(new_file) 79 | # return sample_class_stats 80 | 81 | if __name__ == "__main__": 82 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "synthia" 83 | pool = Pool(32) 84 | for name in ["train", "val"]: 85 | # for name in ["train"]: 86 | annotation_dir = dataset_dir / "GT" / "LABELS" / name 87 | output_dir = dataset_dir / "labels_detectron2" / name 88 | output_dir.mkdir(parents=True, exist_ok=True) 89 | filelist = [] 90 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 91 | output_file = output_dir / file.name 92 | # convert(file, output_file) 93 | filelist.append((file, output_file)) 94 | pool.map(convert, filelist) -------------------------------------------------------------------------------- /hgformer/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | try: 22 | import MultiScaleDeformableAttention as MSDA 23 | except ModuleNotFoundError as e: 24 | info_string = ( 25 | "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" 26 | "\t`cd mask2former/modeling/pixel_decoder/ops`\n" 27 | "\t`sh make.sh`\n" 28 | ) 29 | raise ModuleNotFoundError(info_string) 30 | 31 | 32 | class MSDeformAttnFunction(Function): 33 | @staticmethod 34 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 35 | ctx.im2col_step = im2col_step 36 | output = MSDA.ms_deform_attn_forward( 37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 38 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 39 | return output 40 | 41 | @staticmethod 42 | @once_differentiable 43 | def backward(ctx, grad_output): 44 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 45 | grad_value, grad_sampling_loc, grad_attn_weight = \ 46 | MSDA.ms_deform_attn_backward( 47 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 48 | 49 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 50 | 51 | 52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 53 | # for debug and test only, 54 | # need to use cuda version instead 55 | N_, S_, M_, D_ = value.shape 56 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 57 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 58 | sampling_grids = 2 * sampling_locations - 1 59 | sampling_value_list = [] 60 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 61 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 62 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 63 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 64 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 65 | # N_*M_, D_, Lq_, P_ 66 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 67 | mode='bilinear', padding_mode='zeros', align_corners=False) 68 | sampling_value_list.append(sampling_value_l_) 69 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 70 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 71 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 72 | return 
output.transpose(1, 2).contiguous() 73 | -------------------------------------------------------------------------------- /hgformer/test_time_augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | import logging 4 | from itertools import count 5 | 6 | import numpy as np 7 | import torch 8 | from fvcore.transforms import HFlipTransform 9 | from torch import nn 10 | from torch.nn.parallel import DistributedDataParallel 11 | 12 | from detectron2.data.detection_utils import read_image 13 | from detectron2.modeling import DatasetMapperTTA 14 | 15 | 16 | __all__ = [ 17 | "SemanticSegmentorWithTTA", 18 | ] 19 | 20 | 21 | class SemanticSegmentorWithTTA(nn.Module): 22 | """ 23 | A SemanticSegmentor with test-time augmentation enabled. 24 | Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`. 25 | """ 26 | 27 | def __init__(self, cfg, model, tta_mapper=None, batch_size=1): 28 | """ 29 | Args: 30 | cfg (CfgNode): 31 | model (SemanticSegmentor): a SemanticSegmentor to apply TTA on. 32 | tta_mapper (callable): takes a dataset dict and returns a list of 33 | augmented versions of the dataset dict. Defaults to 34 | `DatasetMapperTTA(cfg)`. 35 | batch_size (int): batch the augmented images into this batch size for inference. 36 | """ 37 | super().__init__() 38 | if isinstance(model, DistributedDataParallel): 39 | model = model.module 40 | self.cfg = cfg.clone() 41 | 42 | self.model = model 43 | 44 | if tta_mapper is None: 45 | tta_mapper = DatasetMapperTTA(cfg) 46 | self.tta_mapper = tta_mapper 47 | self.batch_size = batch_size 48 | 49 | def __call__(self, batched_inputs): 50 | """ 51 | Same input/output format as :meth:`SemanticSegmentor.forward` 52 | """ 53 | 54 | def _maybe_read_image(dataset_dict): 55 | ret = copy.copy(dataset_dict) 56 | if "image" not in ret: 57 | image = read_image(ret.pop("file_name"), self.model.input_format) 58 | image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW 59 | ret["image"] = image 60 | if "height" not in ret and "width" not in ret: 61 | ret["height"] = image.shape[1] 62 | ret["width"] = image.shape[2] 63 | return ret 64 | 65 | processed_results = [] 66 | for x in batched_inputs: 67 | result = self._inference_one_image(_maybe_read_image(x)) 68 | processed_results.append(result) 69 | return processed_results 70 | 71 | def _inference_one_image(self, input): 72 | """ 73 | Args: 74 | input (dict): one dataset dict with "image" field being a CHW tensor 75 | Returns: 76 | dict: one output dict 77 | """ 78 | orig_shape = (input["height"], input["width"]) 79 | augmented_inputs, tfms = self._get_augmented_inputs(input) 80 | 81 | final_predictions = None 82 | count_predictions = 0 83 | for input, tfm in zip(augmented_inputs, tfms): 84 | count_predictions += 1 85 | with torch.no_grad(): 86 | if final_predictions is None: 87 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 88 | final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2]) 89 | else: 90 | final_predictions = self.model([input])[0].pop("sem_seg") 91 | else: 92 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 93 | final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2]) 94 | else: 95 | final_predictions += self.model([input])[0].pop("sem_seg") 96 | 97 | final_predictions = final_predictions / count_predictions 98 | return {"sem_seg": final_predictions} 99 | 100 | def 
_get_augmented_inputs(self, input): 101 | augmented_inputs = self.tta_mapper(input) 102 | tfms = [x.pop("transforms") for x in augmented_inputs] 103 | return augmented_inputs, tfms 104 | -------------------------------------------------------------------------------- /hgformer/modeling/meta_arch/group_former_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import logging 3 | from copy import deepcopy 4 | from typing import Callable, Dict, List, Optional, Tuple, Union 5 | 6 | import fvcore.nn.weight_init as weight_init 7 | from torch import nn 8 | from torch.nn import functional as F 9 | 10 | from detectron2.config import configurable 11 | from detectron2.layers import Conv2d, ShapeSpec, get_norm 12 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY 13 | 14 | from ..transformer_decoder.maskformer_transformer_decoder import build_transformer_decoder 15 | from ..pixel_decoder.fpn import build_pixel_decoder 16 | 17 | 18 | @SEM_SEG_HEADS_REGISTRY.register() 19 | class GroupFormerHead(nn.Module): 20 | 21 | @configurable 22 | def __init__( 23 | self, 24 | input_shape: Dict[str, ShapeSpec], 25 | *, 26 | num_classes: int, 27 | pixel_decoder: nn.Module, 28 | loss_weight: float = 1.0, 29 | ignore_value: int = -1, 30 | # extra parameters 31 | transformer_predictor: nn.Module, 32 | transformer_in_feature: str, 33 | ): 34 | """ 35 | NOTE: this interface is experimental. 36 | Args: 37 | input_shape: shapes (channels and stride) of the input features 38 | num_classes: number of classes to predict 39 | pixel_decoder: the pixel decoder module 40 | loss_weight: loss weight 41 | ignore_value: category id to be ignored during training. 42 | transformer_predictor: the transformer decoder that makes prediction 43 | transformer_in_feature: input feature name to the transformer_predictor 44 | """ 45 | super().__init__() 46 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) 47 | self.in_features = [k for k, v in input_shape] 48 | feature_strides = [v.stride for k, v in input_shape] 49 | feature_channels = [v.channels for k, v in input_shape] 50 | 51 | self.ignore_value = ignore_value 52 | self.common_stride = 4 53 | self.loss_weight = loss_weight 54 | 55 | self.pixel_decoder = pixel_decoder 56 | self.predictor = transformer_predictor 57 | self.transformer_in_feature = transformer_in_feature 58 | 59 | self.num_classes = num_classes 60 | 61 | @classmethod 62 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): 63 | # figure out in_channels to transformer predictor 64 | if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder": 65 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 66 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding": 67 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 68 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder": # for maskformer2 69 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 70 | else: 71 | transformer_predictor_in_channels = input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels 72 | 73 | return { 74 | "input_shape": { 75 | k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES 76 | }, 77 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 78 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 79 | "pixel_decoder": build_pixel_decoder(cfg, input_shape), 80 | "loss_weight": 
cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, 81 | "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE, 82 | "transformer_predictor": build_transformer_decoder( 83 | cfg, 84 | transformer_predictor_in_channels, 85 | mask_classification=True, 86 | ), 87 | } 88 | 89 | def forward(self, features, mask=None): 90 | return self.layers(features, mask) 91 | 92 | def layers(self, features, mask=None): 93 | multi_scale_features = self.pixel_decoder.forward_features(features) 94 | if self.transformer_in_feature == "multi_scale_pixel_decoder": 95 | predictions = self.predictor(multi_scale_features, mask) 96 | else: 97 | raise NotImplementedError 98 | return predictions 99 | -------------------------------------------------------------------------------- /hgformer/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py 3 | """ 4 | Misc functions, including distributed helpers. 5 | 6 | Mostly copy-paste from torchvision references. 7 | """ 8 | from typing import List, Optional 9 | 10 | import torch 11 | import torch.distributed as dist 12 | import torchvision 13 | from torch import Tensor 14 | 15 | 16 | def _max_by_axis(the_list): 17 | # type: (List[List[int]]) -> List[int] 18 | maxes = the_list[0] 19 | for sublist in the_list[1:]: 20 | for index, item in enumerate(sublist): 21 | maxes[index] = max(maxes[index], item) 22 | return maxes 23 | 24 | 25 | class NestedTensor(object): 26 | def __init__(self, tensors, mask: Optional[Tensor]): 27 | self.tensors = tensors 28 | self.mask = mask 29 | 30 | def to(self, device): 31 | # type: (Device) -> NestedTensor # noqa 32 | cast_tensor = self.tensors.to(device) 33 | mask = self.mask 34 | if mask is not None: 35 | assert mask is not None 36 | cast_mask = mask.to(device) 37 | else: 38 | cast_mask = None 39 | return NestedTensor(cast_tensor, cast_mask) 40 | 41 | def decompose(self): 42 | return self.tensors, self.mask 43 | 44 | def __repr__(self): 45 | return str(self.tensors) 46 | 47 | 48 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 49 | # TODO make this more general 50 | if tensor_list[0].ndim == 3: 51 | if torchvision._is_tracing(): 52 | # nested_tensor_from_tensor_list() does not export well to ONNX 53 | # call _onnx_nested_tensor_from_tensor_list() instead 54 | return _onnx_nested_tensor_from_tensor_list(tensor_list) 55 | 56 | # TODO make it support different-sized images 57 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 58 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 59 | batch_shape = [len(tensor_list)] + max_size 60 | b, c, h, w = batch_shape 61 | dtype = tensor_list[0].dtype 62 | device = tensor_list[0].device 63 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 64 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 65 | for img, pad_img, m in zip(tensor_list, tensor, mask): 66 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 67 | m[: img.shape[1], : img.shape[2]] = False 68 | else: 69 | raise ValueError("not supported") 70 | return NestedTensor(tensor, mask) 71 | 72 | 73 | # _onnx_nested_tensor_from_tensor_list() is an implementation of 74 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
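# Worked example for nested_tensor_from_tensor_list() above: two CHW images of
# shapes [3, 20, 30] and [3, 24, 16] are zero-padded into a single tensor of
# shape [2, 3, 24, 30], together with a [2, 24, 30] boolean mask that is True
# exactly on the padded pixels.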
75 | @torch.jit.unused 76 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: 77 | max_size = [] 78 | for i in range(tensor_list[0].dim()): 79 | max_size_i = torch.max( 80 | torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) 81 | ).to(torch.int64) 82 | max_size.append(max_size_i) 83 | max_size = tuple(max_size) 84 | 85 | # work around for 86 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 87 | # m[: img.shape[1], :img.shape[2]] = False 88 | # which is not yet supported in onnx 89 | padded_imgs = [] 90 | padded_masks = [] 91 | for img in tensor_list: 92 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 93 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 94 | padded_imgs.append(padded_img) 95 | 96 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) 97 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) 98 | padded_masks.append(padded_mask.to(torch.bool)) 99 | 100 | tensor = torch.stack(padded_imgs) 101 | mask = torch.stack(padded_masks) 102 | 103 | return NestedTensor(tensor, mask=mask) 104 | 105 | 106 | def is_dist_avail_and_initialized(): 107 | if not dist.is_available(): 108 | return False 109 | if not dist.is_initialized(): 110 | return False 111 | return True 112 | -------------------------------------------------------------------------------- /hgformer/modeling/pixel_decoder/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import time 17 | import torch 18 | import torch.nn as nn 19 | from torch.autograd import gradcheck 20 | 21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | N, M, D = 1, 2, 2 25 | Lq, L, P = 2, 2, 2 26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 28 | S = sum([(H*W).item() for H, W in shapes]) 29 | 30 | 31 | torch.manual_seed(3) 32 | 33 | 34 | @torch.no_grad() 35 | def check_forward_equal_with_pytorch_double(): 36 | value = torch.rand(N, S, M, D).cuda() * 0.01 37 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 38 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 39 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 40 | im2col_step = 2 41 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 42 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 43 | fwdok = torch.allclose(output_cuda, output_pytorch) 44 | max_abs_err = (output_cuda - output_pytorch).abs().max() 45 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 46 | 47 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 48 | 49 | 50 | @torch.no_grad() 51 | def check_forward_equal_with_pytorch_float(): 52 | value = torch.rand(N, S, M, D).cuda() * 0.01 53 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 54 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 55 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 56 | im2col_step = 2 57 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 58 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 59 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 60 | max_abs_err = (output_cuda - output_pytorch).abs().max() 61 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 62 | 63 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 64 | 65 | 66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 67 | 68 | value = torch.rand(N, S, M, channels).cuda() * 0.01 69 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 70 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 71 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 72 | im2col_step = 2 73 | func = MSDeformAttnFunction.apply 74 | 75 | value.requires_grad = grad_value 76 | sampling_locations.requires_grad = grad_sampling_loc 77 | attention_weights.requires_grad = grad_attn_weight 78 | 79 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 80 | 81 | print(f'* 
{gradok} check_gradient_numerical(D={channels})') 82 | 83 | 84 | if __name__ == '__main__': 85 | check_forward_equal_with_pytorch_double() 86 | check_forward_equal_with_pytorch_float() 87 | 88 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 89 | check_gradient_numerical(channels, True, True, True) 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /tools/visualize_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | import argparse 4 | import os 5 | from itertools import chain 6 | import cv2 7 | import tqdm 8 | 9 | from detectron2.config import get_cfg 10 | from detectron2.data import DatasetCatalog, MetadataCatalog, build_detection_train_loader 11 | from detectron2.data import detection_utils as utils 12 | from detectron2.data.build import filter_images_with_few_keypoints 13 | from detectron2.utils.logger import setup_logger 14 | from detectron2.utils.visualizer import Visualizer 15 | from detectron2.projects.deeplab import add_deeplab_config, build_lr_scheduler 16 | 17 | # MaskFormer 18 | from hgformer import ( 19 | COCOInstanceNewBaselineDatasetMapper, 20 | COCOPanopticNewBaselineDatasetMapper, 21 | InstanceSegEvaluator, 22 | MaskFormerInstanceDatasetMapper, 23 | MaskFormerPanopticDatasetMapper, 24 | MaskFormerSemanticDatasetMapper, 25 | SemanticSegmentorWithTTA, 26 | add_maskformer2_config, 27 | ) 28 | 29 | def setup(args): 30 | cfg = get_cfg() 31 | add_deeplab_config(cfg) 32 | add_maskformer2_config(cfg) 33 | if args.config_file: 34 | cfg.merge_from_file(args.config_file) 35 | cfg.merge_from_list(args.opts) 36 | cfg.DATALOADER.NUM_WORKERS = 0 37 | cfg.freeze() 38 | return cfg 39 | 40 | 41 | def parse_args(in_args=None): 42 | parser = argparse.ArgumentParser(description="Visualize ground-truth data") 43 | parser.add_argument( 44 | "--source", 45 | choices=["annotation", "dataloader"], 46 | required=True, 47 | help="visualize the annotations or the data loader (with pre-processing)", 48 | ) 49 | parser.add_argument("--config-file", metavar="FILE", help="path to config file") 50 | parser.add_argument("--output-dir", default="./", help="path to output directory") 51 | parser.add_argument("--show", action="store_true", help="show output in a window") 52 | parser.add_argument( 53 | "opts", 54 | help="Modify config options using the command-line", 55 | default=None, 56 | nargs=argparse.REMAINDER, 57 | ) 58 | return parser.parse_args(in_args) 59 | 60 | 61 | if __name__ == "__main__": 62 | args = parse_args() 63 | logger = setup_logger() 64 | logger.info("Arguments: " + str(args)) 65 | cfg = setup(args) 66 | 67 | dirname = args.output_dir 68 | os.makedirs(dirname, exist_ok=True) 69 | metadata = MetadataCatalog.get(cfg.DATASETS.TRAIN[0]) 70 | 71 | def output(vis, fname): 72 | if args.show: 73 | print(fname) 74 | cv2.imshow("window", vis.get_image()[:, :, ::-1]) 75 | cv2.waitKey() 76 | else: 77 | filepath = os.path.join(dirname, fname) 78 | print("Saving to {} ...".format(filepath)) 79 | vis.save(filepath) 80 | 81 | scale = 1.0 82 | if args.source == "dataloader": 83 | mapper = MaskFormerSemanticDatasetMapper(cfg, True) 84 | train_data_loader = build_detection_train_loader(cfg, mapper=mapper) 85 | for batch in train_data_loader: 86 | for per_image in batch: 87 | # Pytorch tensor is in (C, H, W) format 88 | img = per_image["image"].permute(1, 2, 0).cpu().detach().numpy() 89 | img = utils.convert_image_to_rgb(img, 
cfg.INPUT.FORMAT) 90 | 91 | visualizer = Visualizer(img, metadata=metadata, scale=scale) 92 | # import ipdb; ipdb.set_trace() 93 | target_fields = per_image["instances"].get_fields() 94 | # import ipdb; ipdb.set_trace() 95 | labels = [metadata.stuff_classes[i] for i in target_fields["gt_classes"]] 96 | 97 | 98 | 99 | vis = visualizer.output 100 | # output(vis, str(per_image["image_id"]) + ".jpg") 101 | output(vis, os.path.basename(per_image['file_name'])) 102 | 103 | 104 | 105 | # vis = visualizer.overlay_instances( 106 | # labels=labels, 107 | # # boxes=target_fields.get("gt_boxes", None), 108 | # masks=target_fields.get("gt_masks", None), 109 | # # keypoints=target_fields.get("gt_keypoints", None), 110 | # ) 111 | # # output(vis, str(per_image["image_id"]) + ".jpg") 112 | # output(vis, os.path.basename(per_image['file_name'])) 113 | else: 114 | dicts = list(chain.from_iterable([DatasetCatalog.get(k) for k in cfg.DATASETS.TRAIN])) 115 | if cfg.MODEL.KEYPOINT_ON: 116 | dicts = filter_images_with_few_keypoints(dicts, 1) 117 | for dic in tqdm.tqdm(dicts): 118 | img = utils.read_image(dic["file_name"], "RGB") 119 | visualizer = Visualizer(img, metadata=metadata, scale=scale) 120 | vis = visualizer.draw_dataset_dict(dic) 121 | output(vis, os.path.basename(dic["file_name"])) 122 | -------------------------------------------------------------------------------- /hgformer/evaluation/instance_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import contextlib 3 | import copy 4 | import io 5 | import itertools 6 | import json 7 | import logging 8 | import numpy as np 9 | import os 10 | import pickle 11 | from collections import OrderedDict 12 | import pycocotools.mask as mask_util 13 | import torch 14 | from pycocotools.coco import COCO 15 | from pycocotools.cocoeval import COCOeval 16 | from tabulate import tabulate 17 | 18 | import detectron2.utils.comm as comm 19 | from detectron2.config import CfgNode 20 | from detectron2.data import MetadataCatalog 21 | from detectron2.data.datasets.coco import convert_to_coco_json 22 | from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco 23 | from detectron2.evaluation.fast_eval_api import COCOeval_opt 24 | from detectron2.structures import Boxes, BoxMode, pairwise_iou 25 | from detectron2.utils.file_io import PathManager 26 | from detectron2.utils.logger import create_small_table 27 | 28 | 29 | # modified from COCOEvaluator for instance segmetnat 30 | class InstanceSegEvaluator(COCOEvaluator): 31 | """ 32 | Evaluate AR for object proposals, AP for instance detection/segmentation, AP 33 | for keypoint detection outputs using COCO's metrics. 34 | See http://cocodataset.org/#detection-eval and 35 | http://cocodataset.org/#keypoints-eval to understand its metrics. 36 | The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means 37 | the metric cannot be computed (e.g. due to no predictions made). 38 | 39 | In addition to COCO, this evaluator is able to support any bounding box detection, 40 | instance segmentation, or keypoint detection dataset. 41 | """ 42 | 43 | def _eval_predictions(self, predictions, img_ids=None): 44 | """ 45 | Evaluate predictions. Fill self._results with the metrics of the tasks. 
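        Compared to the base COCOEvaluator, the strict check that predicted category ids form a
        contiguous [0, num_classes) range is relaxed here: a prediction only needs a category id
        that appears in the dataset's ``thing_dataset_id_to_contiguous_id`` mapping.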
46 | """ 47 | self._logger.info("Preparing results for COCO format ...") 48 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) 49 | tasks = self._tasks or self._tasks_from_predictions(coco_results) 50 | 51 | # unmap the category ids for COCO 52 | if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): 53 | dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id 54 | # all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) 55 | # num_classes = len(all_contiguous_ids) 56 | # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 57 | 58 | reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} 59 | for result in coco_results: 60 | category_id = result["category_id"] 61 | # assert category_id < num_classes, ( 62 | # f"A prediction has class={category_id}, " 63 | # f"but the dataset only has {num_classes} classes and " 64 | # f"predicted class id should be in [0, {num_classes - 1}]." 65 | # ) 66 | assert category_id in reverse_id_mapping, ( 67 | f"A prediction has class={category_id}, " 68 | f"but the dataset only has class ids in {dataset_id_to_contiguous_id}." 69 | ) 70 | result["category_id"] = reverse_id_mapping[category_id] 71 | 72 | if self._output_dir: 73 | file_path = os.path.join(self._output_dir, "coco_instances_results.json") 74 | self._logger.info("Saving results to {}".format(file_path)) 75 | with PathManager.open(file_path, "w") as f: 76 | f.write(json.dumps(coco_results)) 77 | f.flush() 78 | 79 | if not self._do_evaluation: 80 | self._logger.info("Annotations are not available for evaluation.") 81 | return 82 | 83 | self._logger.info( 84 | "Evaluating predictions with {} COCO API...".format( 85 | "unofficial" if self._use_fast_impl else "official" 86 | ) 87 | ) 88 | for task in sorted(tasks): 89 | assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!" 90 | coco_eval = ( 91 | _evaluate_predictions_on_coco( 92 | self._coco_api, 93 | coco_results, 94 | task, 95 | kpt_oks_sigmas=self._kpt_oks_sigmas, 96 | use_fast_impl=self._use_fast_impl, 97 | img_ids=img_ids, 98 | max_dets_per_image=self._max_dets_per_image, 99 | ) 100 | if len(coco_results) > 0 101 | else None # cocoapi does not handle empty results very well 102 | ) 103 | 104 | res = self._derive_coco_results( 105 | coco_eval, task, class_names=self._metadata.get("thing_classes") 106 | ) 107 | self._results[task] = res 108 | -------------------------------------------------------------------------------- /demo/inference.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py 3 | import argparse 4 | import glob 5 | import multiprocessing as mp 6 | import os 7 | 8 | # fmt: off 9 | import sys 10 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 11 | # fmt: on 12 | 13 | import tempfile 14 | import time 15 | import warnings 16 | 17 | import cv2 18 | import numpy as np 19 | import tqdm 20 | 21 | from detectron2.config import get_cfg 22 | from detectron2.data.detection_utils import read_image 23 | from detectron2.projects.deeplab import add_deeplab_config 24 | from detectron2.utils.logger import setup_logger 25 | 26 | from hgformer import add_maskformer2_config 27 | from predictor import VisualizationDemo 28 | 29 | 30 | # constants 31 | WINDOW_NAME = "mask2former demo" 32 | 33 | def GetFileFromThisRootDir(dir,ext = None): 34 | allfiles = [] 35 | needExtFilter = (ext != None) 36 | for root,dirs,files in os.walk(dir): 37 | for filespath in files: 38 | filepath = os.path.join(root, filespath) 39 | extension = os.path.splitext(filepath)[1][1:] 40 | if needExtFilter and extension in ext: 41 | allfiles.append(filepath) 42 | elif not needExtFilter: 43 | allfiles.append(filepath) 44 | return allfiles 45 | 46 | def setup_cfg(args): 47 | # load config from file and command-line arguments 48 | cfg = get_cfg() 49 | add_deeplab_config(cfg) 50 | add_maskformer2_config(cfg) 51 | cfg.merge_from_file(args.config_file) 52 | cfg.merge_from_list(args.opts) 53 | cfg.freeze() 54 | return cfg 55 | 56 | 57 | def get_parser(): 58 | parser = argparse.ArgumentParser(description="maskformer2 demo for builtin configs") 59 | parser.add_argument( 60 | "--config-file", 61 | default="configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml", 62 | metavar="FILE", 63 | help="path to config file", 64 | ) 65 | parser.add_argument("--webcam", action="store_true", help="Take inputs from webcam.") 66 | parser.add_argument("--video-input", help="Path to video file.") 67 | parser.add_argument( 68 | "--input", 69 | nargs="+", 70 | help="A list of space separated input images; " 71 | "or a single glob pattern such as 'directory/*.jpg'", 72 | ) 73 | parser.add_argument( 74 | "--output", 75 | help="A file or directory to save output visualizations. 
" 76 | "If not given, will show output in an OpenCV window.", 77 | ) 78 | 79 | parser.add_argument( 80 | "--confidence-threshold", 81 | type=float, 82 | default=0.5, 83 | help="Minimum score for instance predictions to be shown", 84 | ) 85 | parser.add_argument( 86 | "--opts", 87 | help="Modify config options using the command-line 'KEY VALUE' pairs", 88 | default=[], 89 | nargs=argparse.REMAINDER, 90 | ) 91 | return parser 92 | 93 | 94 | def test_opencv_video_format(codec, file_ext): 95 | with tempfile.TemporaryDirectory(prefix="video_format_test") as dir: 96 | filename = os.path.join(dir, "test_file" + file_ext) 97 | writer = cv2.VideoWriter( 98 | filename=filename, 99 | fourcc=cv2.VideoWriter_fourcc(*codec), 100 | fps=float(30), 101 | frameSize=(10, 10), 102 | isColor=True, 103 | ) 104 | [writer.write(np.zeros((10, 10, 3), np.uint8)) for _ in range(30)] 105 | writer.release() 106 | if os.path.isfile(filename): 107 | return True 108 | return False 109 | 110 | 111 | if __name__ == "__main__": 112 | mp.set_start_method("spawn", force=True) 113 | args = get_parser().parse_args() 114 | setup_logger(name="fvcore") 115 | logger = setup_logger() 116 | logger.info("Arguments: " + str(args)) 117 | 118 | cfg = setup_cfg(args) 119 | 120 | demo = VisualizationDemo(cfg) 121 | 122 | # import ipdb; ipdb.set_trace() 123 | filelist = GetFileFromThisRootDir(args.input[0]) 124 | for path in tqdm.tqdm(filelist, disable=not args.output): 125 | # use PIL, to be consistent with evaluation 126 | img = read_image(path, format="BGR") 127 | start_time = time.time() 128 | # predictions, visualized_output = demo.run_on_image(img) 129 | predictions = demo.predictor(img) 130 | 131 | # import ipdb; ipdb.set_trace() 132 | logger.info( 133 | "{}: {} in {:.2f}s".format( 134 | path, 135 | "detected {} instances".format(len(predictions["instances"])) 136 | if "instances" in predictions 137 | else "finished", 138 | time.time() - start_time, 139 | ) 140 | ) 141 | 142 | basename = os.path.basename(path) 143 | if not os.path.exists(args.output): 144 | os.makedirs(args.output) 145 | output_path = os.path.join(args.output, basename) 146 | 147 | outimg = predictions['sem_seg'].detach().cpu().numpy().argmax(0).astype(np.uint8) 148 | cv2.imwrite(output_path, outimg) 149 | -------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | # Prepare Datasets for HGFormer 2 | 3 | A dataset can be used by accessing [DatasetCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.DatasetCatalog) 4 | for its data, or [MetadataCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.MetadataCatalog) for its metadata (class names, etc). 5 | This document explains how to setup the builtin datasets so they can be used by the above APIs. 6 | [Use Custom Datasets](https://detectron2.readthedocs.io/tutorials/datasets.html) gives a deeper dive on how to use `DatasetCatalog` and `MetadataCatalog`, 7 | and how to add new datasets to them. 8 | 9 | HGFormer has builtin support for a few datasets. 10 | The datasets are assumed to exist in a directory specified by the environment variable 11 | `DETECTRON2_DATASETS`. 12 | Under this directory, detectron2 will look for datasets in the structure described below, if needed. 
13 | ``` 14 | $DETECTRON2_DATASETS/ 15 | cityscapes/ 16 | cityscapes-c/ 17 | mapillary/ 18 | acdc/ 19 | bdd/ 20 | gta/ 21 | synthia/ 22 | ``` 23 | 24 | You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`. 25 | If left unset, the default is `./datasets` relative to your current working directory. 26 | 27 | 28 | ## Expected dataset structure for [cityscapes](https://www.cityscapes-dataset.com/downloads/): 29 | ``` 30 | cityscapes/ 31 | gtFine/ 32 | train/ 33 | aachen/ 34 | color.png, instanceIds.png, labelIds.png, polygons.json, 35 | labelTrainIds.png 36 | ... 37 | val/ 38 | test/ 39 | leftImg8bit/ 40 | train/ 41 | val/ 42 | test/ 43 | ``` 44 | 45 | Install cityscapes scripts by: 46 | ``` 47 | pip install git+https://github.com/mcordts/cityscapesScripts.git 48 | ``` 49 | 50 | Note: to create labelTrainIds.png, first prepare the above structure, then run the cityscapes script with: 51 | ``` 52 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesscripts/preparation/createTrainIdLabelImgs.py 53 | ``` 54 | 55 | ## Expected dataset structure for [ACDC](https://acdc.vision.ee.ethz.ch/download): 56 | ``` 57 | acdc/ 58 | rgb_anon/ 59 | fog/ 60 | test/ 61 | night/ 62 | test/ 63 | rain/ 64 | test/ 65 | snow/ 66 | test/ 67 | all/ 68 | test/ 69 | 70 | ``` 71 | You should create the ```all``` folder and copy the test images of all conditions into ```all/test```. 72 | 73 | ## Expected dataset structure for [Mapillary](https://www.mapillary.com/dataset/vistas): 74 | ``` 75 | mapillary/ 76 | training/ 77 | images/ 78 | labels/ 79 | validation/ 80 | images/ 81 | labels/ 82 | testing/ 83 | images/ 84 | labels/ 85 | labels_detectron2/ 86 | training/ 87 | validation/ 88 | ``` 89 | Run `python datasets/prepare_mapillary_sem_seg.py` to map the Mapillary labels to the Cityscapes labels. 90 | 91 | 92 | ## Expected dataset structure for [BDD](https://bdd-data.berkeley.edu/): 93 | ``` 94 | bdd/ 95 | images/ 96 | 10k/ 97 | train/ 98 | val/ 99 | labels/ 100 | sem_seg/ 101 | masks/ 102 | train/ 103 | val/ 104 | ``` 105 | 106 | 107 | ## Expected dataset structure for [Cityscapes-c](): 108 | 109 | ``` 110 | cityscapes-c/ 111 | clean/ 112 | brightness/ 113 | 1/ 114 | 2/ 115 | 3/ 116 | 4/ 117 | 5/ 118 | ... 119 | ``` 120 | 121 | The ```clean``` folder should contain the Cityscapes val-set images. 122 | 123 | The corruption-type folders (e.g. brightness) are generated by running `python datasets/generate_cityscapes_c.py`. 124 | 125 | 126 | ## Expected dataset structure for [GTAV](https://download.visinf.tu-darmstadt.de/data/from_games/): 127 | ``` 128 | gta/ 129 | images/ 130 | train/ 131 | valid/ 132 | test/ 133 | labels/ 134 | train/ 135 | valid/ 136 | test/ 137 | labels_detectron2/ 138 | train/ 139 | valid/ 140 | test/ 141 | ``` 142 | Download the GTA dataset from https://download.visinf.tu-darmstadt.de/data/from_games/ 143 | 144 | Then unzip the images and labels. 145 | 146 | We split the dataset following [RobustNet](https://github.com/shachoi/RobustNet): 147 | ``` 148 | python datasets/split_data/gta/split_gta.py 149 | ``` 150 | For the GTA dataset, a small set of label maps (60 frames) has a different resolution from the corresponding images. 151 | Therefore, we need to resize these label maps.
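For reference, the resize step only needs to bring each mismatched label map to the resolution of its image with nearest-neighbor interpolation, so that the label ids are preserved. The snippet below is a minimal sketch of that step, assuming the `datasets/GTA` layout above and that images and label maps share file names; the script shipped with this repository is the supported way to do it.
```
import os
from PIL import Image

img_dir = "datasets/GTA/images/valid"
lbl_dir = "datasets/GTA/labels/valid"
out_dir = "datasets/GTA/labels/valid_resize"
os.makedirs(out_dir, exist_ok=True)

for name in os.listdir(lbl_dir):
    image = Image.open(os.path.join(img_dir, name))
    label = Image.open(os.path.join(lbl_dir, name))
    if label.size != image.size:
        # nearest-neighbor keeps the label ids intact
        label.resize(image.size, Image.NEAREST).save(os.path.join(out_dir, name))
```
In practice, run the provided script and move the results into place: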
152 | ``` 153 | python datasets/split_data/gta/resize_img.py 154 | mv datasets/GTA/labels/valid_resize/* datasets/GTA/labels/valid/ 155 | rm -rf datasets/GTA/labels/valid_resize/ 156 | ``` 157 | Finally, we map the labels for detectron2: 158 | ``` 159 | python datasets/prepare_gta_sem_seg.py 160 | ``` 161 | 162 | ## Expected dataset structure for [Synthia](https://synthia-dataset.net/downloads/): 163 | ``` 164 | synthia/ 165 | Depth/ 166 | Depth 167 | GT/ 168 | COLOR/ 169 | LABELS/ 170 | train/ 171 | val/ 172 | RGB/ 173 | train/ 174 | val/ 175 | ``` 176 | We follow [RobustNet](https://github.com/shachoi/RobustNet) to split the dataset: 177 | ``` 178 | python datasets/split_data/synthia/split_synthia.py 179 | ``` 180 | We then map the labels from Synthia to Cityscapes: 181 | ``` 182 | python datasets/prepare_synthia_sem_seg.py 183 | ``` 184 | 185 | -------------------------------------------------------------------------------- /hgformer/data/samplers/balanced_sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import itertools 3 | import logging 4 | import math 5 | from collections import defaultdict 6 | from typing import Optional 7 | import torch 8 | from torch.utils.data.sampler import Sampler 9 | from detectron2.utils import comm 10 | 11 | class BalancedTrainingSampler(Sampler): 12 | """ 13 | This is modified from detectron2's RepeatFactorTrainingSampler. 14 | Similar to TrainingSampler, but a sample may appear more times than others based 15 | on its "repeat factor". 16 | """ 17 | 18 | def __init__(self, repeat_factors, *, shuffle=True, seed=None): 19 | """ 20 | Args: 21 | repeat_factors (Tensor): a float vector, the repeat factor for each index. When it's 22 | full of ones, it is equivalent to ``TrainingSampler(len(repeat_factors), ...)``. 23 | shuffle (bool): whether to shuffle the indices or not 24 | seed (int): the initial seed of the shuffle. Must be the same 25 | across all workers. If None, will use a random seed shared 26 | among workers (require synchronization among all workers). 27 | """ 28 | self._shuffle = shuffle 29 | if seed is None: 30 | seed = comm.shared_random_seed() 31 | self._seed = int(seed) 32 | 33 | self._rank = comm.get_rank() 34 | self._world_size = comm.get_world_size() 35 | 36 | # Split into whole number (_int_part) and fractional (_frac_part) parts. 37 | self._int_part = torch.trunc(repeat_factors) 38 | self._frac_part = repeat_factors - self._int_part 39 | 40 | @staticmethod 41 | def repeat_factors_from_category_frequency(dataset_dicts, repeat_thresh): 42 | """ 43 | Compute (fractional) per-image repeat factors based on category frequency. 44 | The repeat factor for an image is a function of the frequency of the rarest 45 | category labeled in that image. The "frequency of category c" in [0, 1] is defined 46 | as the fraction of images in the training set (without repeats) in which category c 47 | appears. 48 | See :paper:`lvis` (>= v2) Appendix B.2. 49 | 50 | Args: 51 | dataset_dicts (list[dict]): annotations in Detectron2 dataset format. 52 | repeat_thresh (float): frequency threshold below which data is repeated. 53 | If the frequency is half of `repeat_thresh`, the image will be 54 | repeated twice. 55 | 56 | Returns: 57 | torch.Tensor: 58 | the i-th element is the repeat factor for the dataset image at index i. 59 | """ 60 | # 1.
For each category c, compute the fraction of images that contain it: f(c) 61 | category_freq = defaultdict(int) 62 | # import ipdb; ipdb.set_trace() 63 | for dataset_dict in dataset_dicts: # For each image (without repeats) 64 | cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]} 65 | for cat_id in cat_ids: 66 | category_freq[cat_id] += 1 67 | num_images = len(dataset_dicts) 68 | for k, v in category_freq.items(): 69 | category_freq[k] = v / num_images 70 | 71 | # 2. For each category c, compute the category-level repeat factor: 72 | # r(c) = max(1, sqrt(t / f(c))) 73 | category_rep = { 74 | cat_id: max(1.0, math.sqrt(repeat_thresh / cat_freq)) 75 | for cat_id, cat_freq in category_freq.items() 76 | } 77 | 78 | # 3. For each image I, compute the image-level repeat factor: 79 | # r(I) = max_{c in I} r(c) 80 | rep_factors = [] 81 | for dataset_dict in dataset_dicts: 82 | cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]} 83 | rep_factor = max({category_rep[cat_id] for cat_id in cat_ids}, default=1.0) 84 | rep_factors.append(rep_factor) 85 | 86 | return torch.tensor(rep_factors, dtype=torch.float32) 87 | 88 | def _get_epoch_indices(self, generator): 89 | """ 90 | Create a list of dataset indices (with repeats) to use for one epoch. 91 | 92 | Args: 93 | generator (torch.Generator): pseudo random number generator used for 94 | stochastic rounding. 95 | 96 | Returns: 97 | torch.Tensor: list of dataset indices to use in one epoch. Each index 98 | is repeated based on its calculated repeat factor. 99 | """ 100 | # Since repeat factors are fractional, we use stochastic rounding so 101 | # that the target repeat factor is achieved in expectation over the 102 | # course of training 103 | rands = torch.rand(len(self._frac_part), generator=generator) 104 | rep_factors = self._int_part + (rands < self._frac_part).float() 105 | # Construct a list of indices in which we repeat images as specified 106 | indices = [] 107 | for dataset_index, rep_factor in enumerate(rep_factors): 108 | indices.extend([dataset_index] * int(rep_factor.item())) 109 | return torch.tensor(indices, dtype=torch.int64) 110 | 111 | def __iter__(self): 112 | start = self._rank 113 | yield from itertools.islice(self._infinite_indices(), start, None, self._world_size) 114 | 115 | def _infinite_indices(self): 116 | g = torch.Generator() 117 | g.manual_seed(self._seed) 118 | while True: 119 | # Sample indices with repeats determined by stochastic rounding; each 120 | # "epoch" may have a slightly different size due to the rounding. 121 | indices = self._get_epoch_indices(g) 122 | if self._shuffle: 123 | randperm = torch.randperm(len(indices), generator=g) 124 | yield from indices[randperm].tolist() 125 | else: 126 | yield from indices.tolist() 127 | -------------------------------------------------------------------------------- /hgformer/modeling/meta_arch/mask_former_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import logging 3 | from copy import deepcopy 4 | from typing import Callable, Dict, List, Optional, Tuple, Union 5 | 6 | import fvcore.nn.weight_init as weight_init 7 | from torch import nn 8 | from torch.nn import functional as F 9 | 10 | from detectron2.config import configurable 11 | from detectron2.layers import Conv2d, ShapeSpec, get_norm 12 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY 13 | 14 | from ..transformer_decoder.maskformer_transformer_decoder import build_transformer_decoder 15 | from ..pixel_decoder.fpn import build_pixel_decoder 16 | 17 | 18 | @SEM_SEG_HEADS_REGISTRY.register() 19 | class MaskFormerHead(nn.Module): 20 | 21 | _version = 2 22 | 23 | # def _load_from_state_dict( 24 | # self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs 25 | # ): 26 | # version = local_metadata.get("version", None) 27 | # if version is None or version < 2: 28 | # # Do not warn if train from scratch 29 | # scratch = True 30 | # logger = logging.getLogger(__name__) 31 | # for k in list(state_dict.keys()): 32 | # newk = k 33 | # if "sem_seg_head" in k and not k.startswith(prefix + "predictor"): 34 | # newk = k.replace(prefix, prefix + "pixel_decoder.") 35 | # # logger.debug(f"{k} ==> {newk}") 36 | # if newk != k: 37 | # state_dict[newk] = state_dict[k] 38 | # del state_dict[k] 39 | # scratch = False 40 | # 41 | # if not scratch: 42 | # logger.warning( 43 | # f"Weight format of {self.__class__.__name__} have changed! " 44 | # "Please upgrade your models. Applying automatic conversion now ..." 45 | # ) 46 | 47 | @configurable 48 | def __init__( 49 | self, 50 | input_shape: Dict[str, ShapeSpec], 51 | *, 52 | num_classes: int, 53 | pixel_decoder: nn.Module, 54 | loss_weight: float = 1.0, 55 | ignore_value: int = -1, 56 | # extra parameters 57 | transformer_predictor: nn.Module, 58 | transformer_in_feature: str, 59 | ): 60 | """ 61 | NOTE: this interface is experimental. 62 | Args: 63 | input_shape: shapes (channels and stride) of the input features 64 | num_classes: number of classes to predict 65 | pixel_decoder: the pixel decoder module 66 | loss_weight: loss weight 67 | ignore_value: category id to be ignored during training. 
68 | transformer_predictor: the transformer decoder that makes prediction 69 | transformer_in_feature: input feature name to the transformer_predictor 70 | """ 71 | super().__init__() 72 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) 73 | self.in_features = [k for k, v in input_shape] 74 | feature_strides = [v.stride for k, v in input_shape] 75 | feature_channels = [v.channels for k, v in input_shape] 76 | 77 | self.ignore_value = ignore_value 78 | self.common_stride = 4 79 | self.loss_weight = loss_weight 80 | 81 | self.pixel_decoder = pixel_decoder 82 | self.predictor = transformer_predictor 83 | self.transformer_in_feature = transformer_in_feature 84 | 85 | self.num_classes = num_classes 86 | 87 | @classmethod 88 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): 89 | # figure out in_channels to transformer predictor 90 | if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder": 91 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 92 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding": 93 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 94 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder": # for maskformer2 95 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 96 | else: 97 | transformer_predictor_in_channels = input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels 98 | 99 | return { 100 | "input_shape": { 101 | k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES 102 | }, 103 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 104 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 105 | "pixel_decoder": build_pixel_decoder(cfg, input_shape), 106 | "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, 107 | "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE, 108 | "transformer_predictor": build_transformer_decoder( 109 | cfg, 110 | transformer_predictor_in_channels, 111 | mask_classification=True, 112 | ), 113 | } 114 | 115 | def forward(self, features, mask=None): 116 | return self.layers(features, mask) 117 | 118 | def layers(self, features, mask=None): 119 | mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(features) 120 | if self.transformer_in_feature == "multi_scale_pixel_decoder": 121 | predictions = self.predictor(multi_scale_features, mask_features, mask) 122 | else: 123 | if self.transformer_in_feature == "transformer_encoder": 124 | assert ( 125 | transformer_encoder_features is not None 126 | ), "Please use the TransformerEncoderPixelDecoder." 127 | predictions = self.predictor(transformer_encoder_features, mask_features, mask) 128 | elif self.transformer_in_feature == "pixel_embedding": 129 | predictions = self.predictor(mask_features, mask_features, mask) 130 | else: 131 | predictions = self.predictor(features[self.transformer_in_feature], mask_features, mask) 132 | return predictions 133 | -------------------------------------------------------------------------------- /tools/analyze_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detectron2/blob/main/tools/analyze_model.py 4 | 5 | import logging 6 | import numpy as np 7 | from collections import Counter 8 | import tqdm 9 | from fvcore.nn import flop_count_table # can also try flop_count_str 10 | 11 | from detectron2.checkpoint import DetectionCheckpointer 12 | from detectron2.config import CfgNode, LazyConfig, get_cfg, instantiate 13 | from detectron2.data import build_detection_test_loader 14 | from detectron2.engine import default_argument_parser 15 | from detectron2.modeling import build_model 16 | from detectron2.projects.deeplab import add_deeplab_config 17 | from detectron2.utils.analysis import ( 18 | FlopCountAnalysis, 19 | activation_count_operators, 20 | parameter_count_table, 21 | ) 22 | from detectron2.utils.logger import setup_logger 23 | 24 | # fmt: off 25 | import os 26 | import sys 27 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 28 | # fmt: on 29 | 30 | from hgformer import add_maskformer2_config 31 | 32 | logger = logging.getLogger("detectron2") 33 | 34 | 35 | def setup(args): 36 | if args.config_file.endswith(".yaml"): 37 | cfg = get_cfg() 38 | add_deeplab_config(cfg) 39 | add_maskformer2_config(cfg) 40 | cfg.merge_from_file(args.config_file) 41 | cfg.DATALOADER.NUM_WORKERS = 0 42 | cfg.merge_from_list(args.opts) 43 | cfg.freeze() 44 | else: 45 | cfg = LazyConfig.load(args.config_file) 46 | cfg = LazyConfig.apply_overrides(cfg, args.opts) 47 | setup_logger(name="fvcore") 48 | setup_logger() 49 | return cfg 50 | 51 | 52 | def do_flop(cfg): 53 | if isinstance(cfg, CfgNode): 54 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) 55 | model = build_model(cfg) 56 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 57 | else: 58 | data_loader = instantiate(cfg.dataloader.test) 59 | model = instantiate(cfg.model) 60 | model.to(cfg.train.device) 61 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint) 62 | model.eval() 63 | 64 | counts = Counter() 65 | total_flops = [] 66 | for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa 67 | if args.use_fixed_input_size and isinstance(cfg, CfgNode): 68 | import torch 69 | crop_size = cfg.INPUT.CROP.SIZE[0] 70 | data[0]["image"] = torch.zeros((3, crop_size, crop_size)) 71 | flops = FlopCountAnalysis(model, data) 72 | if idx > 0: 73 | flops.unsupported_ops_warnings(False).uncalled_modules_warnings(False) 74 | counts += flops.by_operator() 75 | total_flops.append(flops.total()) 76 | 77 | logger.info("Flops table computed from only one input sample:\n" + flop_count_table(flops)) 78 | logger.info( 79 | "Average GFlops for each type of operators:\n" 80 | + str([(k, v / (idx + 1) / 1e9) for k, v in counts.items()]) 81 | ) 82 | logger.info( 83 | "Total GFlops: {:.1f}±{:.1f}".format(np.mean(total_flops) / 1e9, np.std(total_flops) / 1e9) 84 | ) 85 | 86 | 87 | def do_activation(cfg): 88 | if isinstance(cfg, CfgNode): 89 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) 90 | model = build_model(cfg) 91 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 92 | else: 93 | data_loader = instantiate(cfg.dataloader.test) 94 | model = instantiate(cfg.model) 95 | model.to(cfg.train.device) 96 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint) 97 | model.eval() 98 | 99 | counts = Counter() 100 | total_activations = [] 101 | for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa 102 | count = activation_count_operators(model, data) 103 | counts += count 104 
| total_activations.append(sum(count.values())) 105 | logger.info( 106 | "(Million) Activations for Each Type of Operators:\n" 107 | + str([(k, v / idx) for k, v in counts.items()]) 108 | ) 109 | logger.info( 110 | "Total (Million) Activations: {}±{}".format( 111 | np.mean(total_activations), np.std(total_activations) 112 | ) 113 | ) 114 | 115 | 116 | def do_parameter(cfg): 117 | if isinstance(cfg, CfgNode): 118 | model = build_model(cfg) 119 | else: 120 | model = instantiate(cfg.model) 121 | logger.info("Parameter Count:\n" + parameter_count_table(model, max_depth=5)) 122 | 123 | 124 | def do_structure(cfg): 125 | if isinstance(cfg, CfgNode): 126 | model = build_model(cfg) 127 | else: 128 | model = instantiate(cfg.model) 129 | logger.info("Model Structure:\n" + str(model)) 130 | 131 | 132 | if __name__ == "__main__": 133 | parser = default_argument_parser( 134 | epilog=""" 135 | Examples: 136 | To show parameters of a model: 137 | $ ./analyze_model.py --tasks parameter \\ 138 | --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml 139 | Flops and activations are data-dependent, therefore inputs and model weights 140 | are needed to count them: 141 | $ ./analyze_model.py --num-inputs 100 --tasks flop \\ 142 | --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \\ 143 | MODEL.WEIGHTS /path/to/model.pkl 144 | """ 145 | ) 146 | parser.add_argument( 147 | "--tasks", 148 | choices=["flop", "activation", "parameter", "structure"], 149 | required=True, 150 | nargs="+", 151 | ) 152 | parser.add_argument( 153 | "-n", 154 | "--num-inputs", 155 | default=100, 156 | type=int, 157 | help="number of inputs used to compute statistics for flops/activations, " 158 | "both are data dependent.", 159 | ) 160 | parser.add_argument( 161 | "--use-fixed-input-size", 162 | action="store_true", 163 | help="use fixed input size when calculating flops", 164 | ) 165 | args = parser.parse_args() 166 | assert not args.eval_only 167 | assert args.num_gpus == 1 168 | 169 | cfg = setup(args) 170 | 171 | for task in args.tasks: 172 | { 173 | "flop": do_flop, 174 | "activation": do_activation, 175 | "parameter": do_parameter, 176 | "structure": do_structure, 177 | }[task](cfg) 178 | -------------------------------------------------------------------------------- /hgformer/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | import logging 4 | 5 | import numpy as np 6 | import torch 7 | from torch.nn import functional as F 8 | 9 | from detectron2.config import configurable 10 | from detectron2.data import detection_utils as utils 11 | from detectron2.data import transforms as T 12 | from detectron2.structures import BitMasks, Instances 13 | 14 | from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper 15 | 16 | __all__ = ["MaskFormerPanopticDatasetMapper"] 17 | 18 | 19 | class MaskFormerPanopticDatasetMapper(MaskFormerSemanticDatasetMapper): 20 | """ 21 | A callable which takes a dataset dict in Detectron2 Dataset format, 22 | and map it into a format used by MaskFormer for panoptic segmentation. 23 | 24 | The callable currently does the following: 25 | 26 | 1. Read the image from "file_name" 27 | 2. Applies geometric transforms to the image and annotation 28 | 3. Find and applies suitable cropping to the image and annotation 29 | 4. 
Prepare image and annotation to Tensors 30 | """ 31 | 32 | @configurable 33 | def __init__( 34 | self, 35 | is_train=True, 36 | *, 37 | augmentations, 38 | image_format, 39 | ignore_label, 40 | size_divisibility, 41 | ): 42 | """ 43 | NOTE: this interface is experimental. 44 | Args: 45 | is_train: for training or inference 46 | augmentations: a list of augmentations or deterministic transforms to apply 47 | image_format: an image format supported by :func:`detection_utils.read_image`. 48 | ignore_label: the label that is ignored to evaluation 49 | size_divisibility: pad image size to be divisible by this value 50 | """ 51 | super().__init__( 52 | is_train, 53 | augmentations=augmentations, 54 | image_format=image_format, 55 | ignore_label=ignore_label, 56 | size_divisibility=size_divisibility, 57 | ) 58 | 59 | def __call__(self, dataset_dict): 60 | """ 61 | Args: 62 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 63 | 64 | Returns: 65 | dict: a format that builtin models in detectron2 accept 66 | """ 67 | assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!" 68 | 69 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 70 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 71 | utils.check_image_size(dataset_dict, image) 72 | 73 | # semantic segmentation 74 | if "sem_seg_file_name" in dataset_dict: 75 | # PyTorch transformation not implemented for uint16, so converting it to double first 76 | sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") 77 | else: 78 | sem_seg_gt = None 79 | 80 | # panoptic segmentation 81 | if "pan_seg_file_name" in dataset_dict: 82 | pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") 83 | segments_info = dataset_dict["segments_info"] 84 | else: 85 | pan_seg_gt = None 86 | segments_info = None 87 | 88 | if pan_seg_gt is None: 89 | raise ValueError( 90 | "Cannot find 'pan_seg_file_name' for panoptic segmentation dataset {}.".format( 91 | dataset_dict["file_name"] 92 | ) 93 | ) 94 | 95 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt) 96 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) 97 | image = aug_input.image 98 | if sem_seg_gt is not None: 99 | sem_seg_gt = aug_input.sem_seg 100 | 101 | # apply the same transformation to panoptic segmentation 102 | pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) 103 | 104 | from panopticapi.utils import rgb2id 105 | 106 | pan_seg_gt = rgb2id(pan_seg_gt) 107 | 108 | # Pad image and segmentation label here! 
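        # Padding conventions used below: the image is padded with 128 (mid-gray),
        # the semantic map with ``ignore_label``, and the panoptic id map with 0
        # (the VOID id), so padded pixels never contribute to any ground-truth mask.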
109 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 110 | if sem_seg_gt is not None: 111 | sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) 112 | pan_seg_gt = torch.as_tensor(pan_seg_gt.astype("long")) 113 | 114 | if self.size_divisibility > 0: 115 | image_size = (image.shape[-2], image.shape[-1]) 116 | padding_size = [ 117 | 0, 118 | self.size_divisibility - image_size[1], 119 | 0, 120 | self.size_divisibility - image_size[0], 121 | ] 122 | image = F.pad(image, padding_size, value=128).contiguous() 123 | if sem_seg_gt is not None: 124 | sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous() 125 | pan_seg_gt = F.pad( 126 | pan_seg_gt, padding_size, value=0 127 | ).contiguous() # 0 is the VOID panoptic label 128 | 129 | image_shape = (image.shape[-2], image.shape[-1]) # h, w 130 | 131 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 132 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 133 | # Therefore it's important to use torch.Tensor. 134 | dataset_dict["image"] = image 135 | if sem_seg_gt is not None: 136 | dataset_dict["sem_seg"] = sem_seg_gt.long() 137 | 138 | if "annotations" in dataset_dict: 139 | raise ValueError("Pemantic segmentation dataset should not have 'annotations'.") 140 | 141 | # Prepare per-category binary masks 142 | pan_seg_gt = pan_seg_gt.numpy() 143 | instances = Instances(image_shape) 144 | classes = [] 145 | masks = [] 146 | for segment_info in segments_info: 147 | class_id = segment_info["category_id"] 148 | if not segment_info["iscrowd"]: 149 | classes.append(class_id) 150 | masks.append(pan_seg_gt == segment_info["id"]) 151 | 152 | classes = np.array(classes) 153 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64) 154 | if len(masks) == 0: 155 | # Some image does not have annotation (all ignored) 156 | instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) 157 | else: 158 | masks = BitMasks( 159 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) 160 | ) 161 | instances.gt_masks = masks.tensor 162 | 163 | dataset_dict["instances"] = instances 164 | 165 | return dataset_dict 166 | -------------------------------------------------------------------------------- /hgformer/modeling/pixel_decoder/ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import warnings 17 | import math 18 | 19 | import torch 20 | from torch import nn 21 | import torch.nn.functional as F 22 | from torch.nn.init import xavier_uniform_, constant_ 23 | 24 | from ..functions import MSDeformAttnFunction 25 | from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch 26 | 27 | 28 | def _is_power_of_2(n): 29 | if (not isinstance(n, int)) or (n < 0): 30 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 31 | return (n & (n-1) == 0) and n != 0 32 | 33 | 34 | class MSDeformAttn(nn.Module): 35 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): 36 | """ 37 | Multi-Scale Deformable Attention Module 38 | :param d_model hidden dimension 39 | :param n_levels number of feature levels 40 | :param n_heads number of attention heads 41 | :param n_points number of sampling points per attention head per feature level 42 | """ 43 | super().__init__() 44 | if d_model % n_heads != 0: 45 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 46 | _d_per_head = d_model // n_heads 47 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 48 | if not _is_power_of_2(_d_per_head): 49 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 50 | "which is more efficient in our CUDA implementation.") 51 | 52 | self.im2col_step = 128 53 | 54 | self.d_model = d_model 55 | self.n_levels = n_levels 56 | self.n_heads = n_heads 57 | self.n_points = n_points 58 | 59 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 60 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 61 | self.value_proj = nn.Linear(d_model, d_model) 62 | self.output_proj = nn.Linear(d_model, d_model) 63 | 64 | self._reset_parameters() 65 | 66 | def _reset_parameters(self): 67 | constant_(self.sampling_offsets.weight.data, 0.) 68 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 69 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 70 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 71 | for i in range(self.n_points): 72 | grid_init[:, :, i, :] *= i + 1 73 | with torch.no_grad(): 74 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 75 | constant_(self.attention_weights.weight.data, 0.) 76 | constant_(self.attention_weights.bias.data, 0.) 77 | xavier_uniform_(self.value_proj.weight.data) 78 | constant_(self.value_proj.bias.data, 0.) 79 | xavier_uniform_(self.output_proj.weight.data) 80 | constant_(self.output_proj.bias.data, 0.) 
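    # Initialization above, in short: the sampling-offset bias is set so that each of the
    # n_heads heads starts by sampling along its own evenly spaced direction, with the i-th
    # point pushed (i + 1) steps further out; attention-weight logits start at zero (uniform
    # after the softmax), and the value/output projections use Xavier weights with zero bias.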
81 | 82 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 83 | """ 84 | :param query (N, Length_{query}, C) 85 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 86 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 87 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 88 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 89 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 90 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 91 | 92 | :return output (N, Length_{query}, C) 93 | """ 94 | N, Len_q, _ = query.shape 95 | N, Len_in, _ = input_flatten.shape 96 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 97 | 98 | value = self.value_proj(input_flatten) 99 | if input_padding_mask is not None: 100 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 101 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 102 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 103 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 104 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 105 | # N, Len_q, n_heads, n_levels, n_points, 2 106 | if reference_points.shape[-1] == 2: 107 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 108 | sampling_locations = reference_points[:, :, None, :, None, :] \ 109 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] 110 | elif reference_points.shape[-1] == 4: 111 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 112 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 113 | else: 114 | raise ValueError( 115 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 116 | try: 117 | output = MSDeformAttnFunction.apply( 118 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 119 | except: 120 | # CPU 121 | output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 122 | # # For FLOPs calculation only 123 | # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 124 | output = self.output_proj(output) 125 | return output 126 | -------------------------------------------------------------------------------- /hgformer/data/dataset_mappers/mask_former_instance_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import copy 3 | import logging 4 | 5 | import numpy as np 6 | import pycocotools.mask as mask_util 7 | import torch 8 | from torch.nn import functional as F 9 | 10 | from detectron2.config import configurable 11 | from detectron2.data import detection_utils as utils 12 | from detectron2.data import transforms as T 13 | from detectron2.projects.point_rend import ColorAugSSDTransform 14 | from detectron2.structures import BitMasks, Instances, polygons_to_bitmask 15 | 16 | __all__ = ["MaskFormerInstanceDatasetMapper"] 17 | 18 | 19 | class MaskFormerInstanceDatasetMapper: 20 | """ 21 | A callable which takes a dataset dict in Detectron2 Dataset format, 22 | and map it into a format used by MaskFormer for instance segmentation. 23 | 24 | The callable currently does the following: 25 | 26 | 1. Read the image from "file_name" 27 | 2. Applies geometric transforms to the image and annotation 28 | 3. Find and applies suitable cropping to the image and annotation 29 | 4. Prepare image and annotation to Tensors 30 | """ 31 | 32 | @configurable 33 | def __init__( 34 | self, 35 | is_train=True, 36 | *, 37 | augmentations, 38 | image_format, 39 | size_divisibility, 40 | ): 41 | """ 42 | NOTE: this interface is experimental. 43 | Args: 44 | is_train: for training or inference 45 | augmentations: a list of augmentations or deterministic transforms to apply 46 | image_format: an image format supported by :func:`detection_utils.read_image`. 47 | size_divisibility: pad image size to be divisible by this value 48 | """ 49 | self.is_train = is_train 50 | self.tfm_gens = augmentations 51 | self.img_format = image_format 52 | self.size_divisibility = size_divisibility 53 | 54 | logger = logging.getLogger(__name__) 55 | mode = "training" if is_train else "inference" 56 | logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}") 57 | 58 | @classmethod 59 | def from_config(cls, cfg, is_train=True): 60 | # Build augmentation 61 | augs = [ 62 | T.ResizeShortestEdge( 63 | cfg.INPUT.MIN_SIZE_TRAIN, 64 | cfg.INPUT.MAX_SIZE_TRAIN, 65 | cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, 66 | ) 67 | ] 68 | if cfg.INPUT.CROP.ENABLED: 69 | augs.append( 70 | T.RandomCrop( 71 | cfg.INPUT.CROP.TYPE, 72 | cfg.INPUT.CROP.SIZE, 73 | ) 74 | ) 75 | if cfg.INPUT.COLOR_AUG_SSD: 76 | augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) 77 | augs.append(T.RandomFlip()) 78 | 79 | ret = { 80 | "is_train": is_train, 81 | "augmentations": augs, 82 | "image_format": cfg.INPUT.FORMAT, 83 | "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY, 84 | } 85 | return ret 86 | 87 | def __call__(self, dataset_dict): 88 | """ 89 | Args: 90 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 91 | 92 | Returns: 93 | dict: a format that builtin models in detectron2 accept 94 | """ 95 | assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!" 
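        # Overall flow: read the image, apply the configured augmentations, turn every
        # non-crowd annotation (polygon, COCO RLE, or binary array) into a bitmask,
        # optionally pad image and masks to ``size_divisibility``, and pack the result
        # into an Instances object stored under "instances".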
96 | 97 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 98 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 99 | utils.check_image_size(dataset_dict, image) 100 | 101 | aug_input = T.AugInput(image) 102 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) 103 | image = aug_input.image 104 | 105 | # transform instnace masks 106 | assert "annotations" in dataset_dict 107 | for anno in dataset_dict["annotations"]: 108 | anno.pop("keypoints", None) 109 | 110 | annos = [ 111 | utils.transform_instance_annotations(obj, transforms, image.shape[:2]) 112 | for obj in dataset_dict.pop("annotations") 113 | if obj.get("iscrowd", 0) == 0 114 | ] 115 | 116 | if len(annos): 117 | assert "segmentation" in annos[0] 118 | segms = [obj["segmentation"] for obj in annos] 119 | masks = [] 120 | for segm in segms: 121 | if isinstance(segm, list): 122 | # polygon 123 | masks.append(polygons_to_bitmask(segm, *image.shape[:2])) 124 | elif isinstance(segm, dict): 125 | # COCO RLE 126 | masks.append(mask_util.decode(segm)) 127 | elif isinstance(segm, np.ndarray): 128 | assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format( 129 | segm.ndim 130 | ) 131 | # mask array 132 | masks.append(segm) 133 | else: 134 | raise ValueError( 135 | "Cannot convert segmentation of type '{}' to BitMasks!" 136 | "Supported types are: polygons as list[list[float] or ndarray]," 137 | " COCO-style RLE as a dict, or a binary segmentation mask " 138 | " in a 2D numpy array of shape HxW.".format(type(segm)) 139 | ) 140 | 141 | # Pad image and segmentation label here! 142 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 143 | masks = [torch.from_numpy(np.ascontiguousarray(x)) for x in masks] 144 | 145 | classes = [int(obj["category_id"]) for obj in annos] 146 | classes = torch.tensor(classes, dtype=torch.int64) 147 | 148 | if self.size_divisibility > 0: 149 | image_size = (image.shape[-2], image.shape[-1]) 150 | padding_size = [ 151 | 0, 152 | self.size_divisibility - image_size[1], 153 | 0, 154 | self.size_divisibility - image_size[0], 155 | ] 156 | # pad image 157 | image = F.pad(image, padding_size, value=128).contiguous() 158 | # pad mask 159 | masks = [F.pad(x, padding_size, value=0).contiguous() for x in masks] 160 | 161 | image_shape = (image.shape[-2], image.shape[-1]) # h, w 162 | 163 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 164 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 165 | # Therefore it's important to use torch.Tensor. 166 | dataset_dict["image"] = image 167 | 168 | # Prepare per-category binary masks 169 | instances = Instances(image_shape) 170 | instances.gt_classes = classes 171 | if len(masks) == 0: 172 | # Some image does not have annotation (all ignored) 173 | instances.gt_masks = torch.zeros((0, image.shape[-2], image.shape[-1])) 174 | else: 175 | masks = BitMasks(torch.stack(masks)) 176 | instances.gt_masks = masks.tensor 177 | 178 | dataset_dict["instances"] = instances 179 | 180 | return dataset_dict 181 | -------------------------------------------------------------------------------- /hgformer/modeling/transformer_decoder/maskformer_transformer_decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py 3 | import fvcore.nn.weight_init as weight_init 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | from detectron2.config import configurable 9 | from detectron2.layers import Conv2d 10 | from detectron2.utils.registry import Registry 11 | 12 | from .position_encoding import PositionEmbeddingSine 13 | from .transformer import Transformer 14 | 15 | 16 | TRANSFORMER_DECODER_REGISTRY = Registry("TRANSFORMER_MODULE") 17 | TRANSFORMER_DECODER_REGISTRY.__doc__ = """ 18 | Registry for transformer module in MaskFormer. 19 | """ 20 | 21 | 22 | def build_transformer_decoder(cfg, in_channels, mask_classification=True): 23 | """ 24 | Build a instance embedding branch from `cfg.MODEL.INS_EMBED_HEAD.NAME`. 25 | """ 26 | name = cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME 27 | return TRANSFORMER_DECODER_REGISTRY.get(name)(cfg, in_channels, mask_classification) 28 | 29 | 30 | @TRANSFORMER_DECODER_REGISTRY.register() 31 | class StandardTransformerDecoder(nn.Module): 32 | @configurable 33 | def __init__( 34 | self, 35 | in_channels, 36 | mask_classification=True, 37 | *, 38 | num_classes: int, 39 | hidden_dim: int, 40 | num_queries: int, 41 | nheads: int, 42 | dropout: float, 43 | dim_feedforward: int, 44 | enc_layers: int, 45 | dec_layers: int, 46 | pre_norm: bool, 47 | deep_supervision: bool, 48 | mask_dim: int, 49 | enforce_input_project: bool, 50 | ): 51 | """ 52 | NOTE: this interface is experimental. 53 | Args: 54 | in_channels: channels of the input features 55 | mask_classification: whether to add mask classifier or not 56 | num_classes: number of classes 57 | hidden_dim: Transformer feature dimension 58 | num_queries: number of queries 59 | nheads: number of heads 60 | dropout: dropout in Transformer 61 | dim_feedforward: feature dimension in feedforward network 62 | enc_layers: number of Transformer encoder layers 63 | dec_layers: number of Transformer decoder layers 64 | pre_norm: whether to use pre-LayerNorm or not 65 | deep_supervision: whether to add supervision to every decoder layers 66 | mask_dim: mask feature dimension 67 | enforce_input_project: add input project 1x1 conv even if input 68 | channels and hidden dim is identical 69 | """ 70 | super().__init__() 71 | 72 | self.mask_classification = mask_classification 73 | 74 | # positional encoding 75 | N_steps = hidden_dim // 2 76 | self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) 77 | 78 | transformer = Transformer( 79 | d_model=hidden_dim, 80 | dropout=dropout, 81 | nhead=nheads, 82 | dim_feedforward=dim_feedforward, 83 | num_encoder_layers=enc_layers, 84 | num_decoder_layers=dec_layers, 85 | normalize_before=pre_norm, 86 | return_intermediate_dec=deep_supervision, 87 | ) 88 | 89 | self.num_queries = num_queries 90 | self.transformer = transformer 91 | hidden_dim = transformer.d_model 92 | 93 | self.query_embed = nn.Embedding(num_queries, hidden_dim) 94 | 95 | if in_channels != hidden_dim or enforce_input_project: 96 | self.input_proj = Conv2d(in_channels, hidden_dim, kernel_size=1) 97 | weight_init.c2_xavier_fill(self.input_proj) 98 | else: 99 | self.input_proj = nn.Sequential() 100 | self.aux_loss = deep_supervision 101 | 102 | # output FFNs 103 | if self.mask_classification: 104 | self.class_embed = nn.Linear(hidden_dim, num_classes + 1) 105 | self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3) 106 | 107 | @classmethod 108 | def from_config(cls, cfg, in_channels, 
mask_classification): 109 | ret = {} 110 | ret["in_channels"] = in_channels 111 | ret["mask_classification"] = mask_classification 112 | 113 | ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES 114 | ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM 115 | ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES 116 | # Transformer parameters: 117 | ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS 118 | ret["dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT 119 | ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD 120 | ret["enc_layers"] = cfg.MODEL.MASK_FORMER.ENC_LAYERS 121 | ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS 122 | ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM 123 | ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION 124 | ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ 125 | 126 | ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 127 | 128 | return ret 129 | 130 | def forward(self, x, mask_features, mask=None): 131 | if mask is not None: 132 | mask = F.interpolate(mask[None].float(), size=x.shape[-2:]).to(torch.bool)[0] 133 | pos = self.pe_layer(x, mask) 134 | 135 | src = x 136 | hs, memory = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos) 137 | 138 | if self.mask_classification: 139 | outputs_class = self.class_embed(hs) 140 | out = {"pred_logits": outputs_class[-1]} 141 | else: 142 | out = {} 143 | 144 | if self.aux_loss: 145 | # [l, bs, queries, embed] 146 | mask_embed = self.mask_embed(hs) 147 | outputs_seg_masks = torch.einsum("lbqc,bchw->lbqhw", mask_embed, mask_features) 148 | out["pred_masks"] = outputs_seg_masks[-1] 149 | out["aux_outputs"] = self._set_aux_loss( 150 | outputs_class if self.mask_classification else None, outputs_seg_masks 151 | ) 152 | else: 153 | # FIXME h_boxes takes the last one computed, keep this in mind 154 | # [bs, queries, embed] 155 | mask_embed = self.mask_embed(hs[-1]) 156 | outputs_seg_masks = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features) 157 | out["pred_masks"] = outputs_seg_masks 158 | return out 159 | 160 | @torch.jit.unused 161 | def _set_aux_loss(self, outputs_class, outputs_seg_masks): 162 | # this is a workaround to make torchscript happy, as torchscript 163 | # doesn't support dictionary with non-homogeneous values, such 164 | # as a dict having both a Tensor and a list. 165 | if self.mask_classification: 166 | return [ 167 | {"pred_logits": a, "pred_masks": b} 168 | for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1]) 169 | ] 170 | else: 171 | return [{"pred_masks": b} for b in outputs_seg_masks[:-1]] 172 | 173 | 174 | class MLP(nn.Module): 175 | """Very simple multi-layer perceptron (also called FFN)""" 176 | 177 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 178 | super().__init__() 179 | self.num_layers = num_layers 180 | h = [hidden_dim] * (num_layers - 1) 181 | self.layers = nn.ModuleList( 182 | nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) 183 | ) 184 | 185 | def forward(self, x): 186 | for i, layer in enumerate(self.layers): 187 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 188 | return x 189 | -------------------------------------------------------------------------------- /hgformer/data/dataset_mappers/mask_former_semantic_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import copy 3 | import logging 4 | 5 | import numpy as np 6 | import torch 7 | from torch.nn import functional as F 8 | 9 | from detectron2.config import configurable 10 | from detectron2.data import MetadataCatalog 11 | from detectron2.data import detection_utils as utils 12 | from detectron2.data import transforms as T 13 | from detectron2.projects.point_rend import ColorAugSSDTransform 14 | from detectron2.structures import BitMasks, Instances 15 | 16 | __all__ = ["MaskFormerSemanticDatasetMapper"] 17 | 18 | 19 | class MaskFormerSemanticDatasetMapper: 20 | """ 21 | A callable which takes a dataset dict in Detectron2 Dataset format, 22 | and map it into a format used by MaskFormer for semantic segmentation. 23 | 24 | The callable currently does the following: 25 | 26 | 1. Read the image from "file_name" 27 | 2. Applies geometric transforms to the image and annotation 28 | 3. Find and applies suitable cropping to the image and annotation 29 | 4. Prepare image and annotation to Tensors 30 | """ 31 | 32 | @configurable 33 | def __init__( 34 | self, 35 | is_train=True, 36 | *, 37 | augmentations, 38 | image_format, 39 | ignore_label, 40 | size_divisibility, 41 | ): 42 | """ 43 | NOTE: this interface is experimental. 44 | Args: 45 | is_train: for training or inference 46 | augmentations: a list of augmentations or deterministic transforms to apply 47 | image_format: an image format supported by :func:`detection_utils.read_image`. 48 | ignore_label: the label that is ignored to evaluation 49 | size_divisibility: pad image size to be divisible by this value 50 | """ 51 | self.is_train = is_train 52 | self.tfm_gens = augmentations 53 | self.img_format = image_format 54 | self.ignore_label = ignore_label 55 | self.size_divisibility = size_divisibility 56 | 57 | logger = logging.getLogger(__name__) 58 | mode = "training" if is_train else "inference" 59 | logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}") 60 | 61 | @classmethod 62 | def from_config(cls, cfg, is_train=True): 63 | # Build augmentation 64 | augs = [ 65 | T.ResizeShortestEdge( 66 | cfg.INPUT.MIN_SIZE_TRAIN, 67 | cfg.INPUT.MAX_SIZE_TRAIN, 68 | cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, 69 | ) 70 | ] 71 | if cfg.INPUT.CROP.ENABLED: 72 | augs.append( 73 | T.RandomCrop_CategoryAreaConstraint( 74 | cfg.INPUT.CROP.TYPE, 75 | cfg.INPUT.CROP.SIZE, 76 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA, 77 | cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 78 | ) 79 | ) 80 | # import ipdb; ipdb.set_trace() 81 | if cfg.INPUT.COLOR_AUG_SSD: 82 | augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) 83 | augs.append(T.RandomFlip()) 84 | 85 | # Assume always applies to the training set. 86 | dataset_names = cfg.DATASETS.TRAIN 87 | meta = MetadataCatalog.get(dataset_names[0]) 88 | ignore_label = meta.ignore_label 89 | 90 | ret = { 91 | "is_train": is_train, 92 | "augmentations": augs, 93 | "image_format": cfg.INPUT.FORMAT, 94 | "ignore_label": ignore_label, 95 | "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY, 96 | } 97 | return ret 98 | 99 | def __call__(self, dataset_dict): 100 | """ 101 | Args: 102 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 103 | 104 | Returns: 105 | dict: a format that builtin models in detectron2 accept 106 | """ 107 | assert self.is_train, "MaskFormerSemanticDatasetMapper should only be used for training!" 
108 | 
109 |         dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
110 |         image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
111 |         utils.check_image_size(dataset_dict, image)
112 |         if "sem_seg_file_name" in dataset_dict:
113 |             # PyTorch transformation not implemented for uint16, so converting it to double first
114 |             sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double")
115 |         else:
116 |             sem_seg_gt = None
117 | 
118 |         if sem_seg_gt is None:
119 |             raise ValueError(
120 |                 "Cannot find 'sem_seg_file_name' for semantic segmentation dataset {}.".format(
121 |                     dataset_dict["file_name"]
122 |                 )
123 |             )
124 | 
125 |         aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
126 |         aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
127 |         image = aug_input.image
128 |         sem_seg_gt = aug_input.sem_seg
129 | 
130 |         # Pad image and segmentation label here!
131 |         image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
132 |         if sem_seg_gt is not None:
133 |             sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long"))
134 | 
135 |         if self.size_divisibility > 0:
136 |             image_size = (image.shape[-2], image.shape[-1])
137 |             padding_size = [
138 |                 0,
139 |                 self.size_divisibility - image_size[1],
140 |                 0,
141 |                 self.size_divisibility - image_size[0],
142 |             ]
143 |             image = F.pad(image, padding_size, value=128).contiguous()
144 |             if sem_seg_gt is not None:
145 |                 sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous()
146 | 
147 |         image_shape = (image.shape[-2], image.shape[-1])  # h, w
148 | 
149 |         # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
150 |         # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
151 |         # Therefore it's important to use torch.Tensor.
152 |         dataset_dict["image"] = image
153 | 
154 |         if sem_seg_gt is not None:
155 |             dataset_dict["sem_seg"] = sem_seg_gt.long()
156 |         if "annotations" in dataset_dict:
157 |             raise ValueError("Semantic segmentation dataset should not have 'annotations'.")
158 |         # Prepare per-category binary masks
159 |         if sem_seg_gt is not None:
160 |             sem_seg_gt = sem_seg_gt.numpy()
161 |             instances = Instances(image_shape)
162 |             classes = np.unique(sem_seg_gt)
163 |             # remove ignored region
164 |             classes = classes[classes != self.ignore_label]
165 |             instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
166 | 
167 |             masks = []
168 |             for class_id in classes:
169 |                 masks.append(sem_seg_gt == class_id)
170 | 
171 |             if len(masks) == 0:
172 |                 # Some image does not have annotation (all ignored)
173 |                 instances.gt_masks = torch.zeros((0, sem_seg_gt.shape[-2], sem_seg_gt.shape[-1]))
174 |             else:
175 |                 masks = BitMasks(
176 |                     torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
177 |                 )
178 |                 instances.gt_masks = masks.tensor
179 | 
180 |             dataset_dict["instances"] = instances
181 | 
182 |         return dataset_dict
183 | 
--------------------------------------------------------------------------------
/hgformer/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #include <vector>
17 | #include "cuda/ms_deform_im2col_cuda.cuh"
18 | 
19 | #include <ATen/ATen.h>
20 | #include <ATen/cuda/CUDAContext.h>
21 | #include <cuda.h>
22 | #include <cuda_runtime.h>
23 | 
24 | 
25 | at::Tensor ms_deform_attn_cuda_forward(
26 |     const at::Tensor &value,
27 |     const at::Tensor &spatial_shapes,
28 |     const at::Tensor &level_start_index,
29 |     const at::Tensor &sampling_loc,
30 |     const at::Tensor &attn_weight,
31 |     const int im2col_step)
32 | {
33 |     AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
34 |     AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
35 |     AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
36 |     AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
37 |     AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
38 | 
39 |     AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
40 |     AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
41 |     AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
42 |     AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
43 |     AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
44 | 
45 |     const int batch = value.size(0);
46 |     const int spatial_size = value.size(1);
47 |     const int num_heads = value.size(2);
48 |     const int channels = value.size(3);
49 | 
50 |     const int num_levels = spatial_shapes.size(0);
51 | 
52 |     const int num_query = sampling_loc.size(1);
53 |     const int num_point = sampling_loc.size(4);
54 | 
55 |     const int im2col_step_ = std::min(batch, im2col_step);
56 | 
57 |     AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
58 | 
59 |     auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
60 | 
61 |     const int batch_n = im2col_step_;
62 |     auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
63 |     auto per_value_size = spatial_size * num_heads * channels;
64 |     auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
65 |     auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
66 |     for (int n = 0; n < batch/im2col_step_; ++n)
67 |     {
68 |         auto columns = output_n.select(0, n);
69 |         AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] {
70 |             ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
71 |                 value.data<scalar_t>() + n * im2col_step_ * per_value_size,
72 |                 spatial_shapes.data<int64_t>(),
73 |                 level_start_index.data<int64_t>(),
74 |                 sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
75 |                 attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
76 |                 batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
77 |                 columns.data<scalar_t>());
78 | 
79 |         }));
80 |     }
81 | 
82 |     output = output.view({batch, num_query, num_heads*channels});
83 | 
84 |     return output;
85 | }
86 | 
87 | 
88 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
89 |     const at::Tensor &value,
90 |     const at::Tensor &spatial_shapes,
91 |     const at::Tensor &level_start_index,
92 |     const at::Tensor &sampling_loc,
93 |     const at::Tensor &attn_weight,
94 |     const at::Tensor &grad_output,
95 |     const int im2col_step)
96 | {
97 | 
98 |     AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
99 |     AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
100 |     AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
101 |     AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
102 |     AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
103 |     AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
104 | 
105 |     AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
106 |     AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
107 |     AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
108 |     AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
109 |     AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
110 |     AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
111 | 
112 |     const int batch = value.size(0);
113 |     const int spatial_size = value.size(1);
114 |     const int num_heads = value.size(2);
115 |     const int channels = value.size(3);
116 | 
117 |     const int num_levels = spatial_shapes.size(0);
118 | 
119 |     const int num_query = sampling_loc.size(1);
120 |     const int num_point = sampling_loc.size(4);
121 | 
122 |     const int im2col_step_ = std::min(batch, im2col_step);
123 | 
124 |     AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
125 | 
126 |     auto grad_value = at::zeros_like(value);
127 |     auto grad_sampling_loc = at::zeros_like(sampling_loc);
128 |     auto grad_attn_weight = at::zeros_like(attn_weight);
129 | 
130 |     const int batch_n = im2col_step_;
131 |     auto per_value_size = spatial_size * num_heads * channels;
132 |     auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
133 |     auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
134 |     auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
135 | 
136 |     for (int n = 0; n < batch/im2col_step_; ++n)
137 |     {
138 |         auto grad_output_g = grad_output_n.select(0, n);
139 |         AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] {
140 |             ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
141 |                 grad_output_g.data<scalar_t>(),
142 |                 value.data<scalar_t>() + n * im2col_step_ * per_value_size,
143 |                 spatial_shapes.data<int64_t>(),
144 |                 level_start_index.data<int64_t>(),
145 |                 sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
146 |                 attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
147 |                 batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
148 |                 grad_value.data<scalar_t>() + n * im2col_step_ * per_value_size,
149 |                 grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
150 |                 grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
151 | 
152 |         }));
153 |     }
154 | 
155 |     return {
156 | 
grad_value, grad_sampling_loc, grad_attn_weight 157 | }; 158 | } -------------------------------------------------------------------------------- /hgformer/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | from detectron2.config import CfgNode as CN 4 | 5 | 6 | def add_maskformer2_config(cfg): 7 | """ 8 | Add config for MASK_FORMER. 9 | """ 10 | # NOTE: configs from original maskformer 11 | # data config 12 | # select the dataset mapper 13 | cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic" 14 | # Color augmentation 15 | cfg.INPUT.COLOR_AUG_SSD = False 16 | cfg.INPUT.COLOR_AUG_MIX = 'partial' 17 | # We retry random cropping until no single category in semantic segmentation GT occupies more 18 | # than `SINGLE_CATEGORY_MAX_AREA` part of the crop. 19 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0 20 | # Pad image and segmentation GT in dataset mapper. 21 | cfg.INPUT.SIZE_DIVISIBILITY = -1 22 | 23 | # solver config 24 | # weight decay on embedding 25 | cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0 26 | # optimizer 27 | cfg.SOLVER.OPTIMIZER = "ADAMW" 28 | cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1 29 | 30 | # mask_former model config 31 | cfg.MODEL.MASK_FORMER = CN() 32 | 33 | # loss 34 | cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION = True 35 | cfg.MODEL.MASK_FORMER.DEEP_MASK_SUPERVISION = False 36 | cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT = 0.1 37 | cfg.MODEL.MASK_FORMER.CLASS_WEIGHT = 1.0 38 | cfg.MODEL.MASK_FORMER.DICE_WEIGHT = 1.0 39 | cfg.MODEL.MASK_FORMER.MASK_WEIGHT = 20.0 40 | cfg.MODEL.MASK_FORMER.SPIX_MASK_WEIGHT = 20.0 41 | cfg.MODEL.MASK_FORMER.SPIX_COLOR_WEIGHT = 1.0 42 | cfg.MODEL.MASK_FORMER.SPIX_CLASS_WEIGHT = 1.0 43 | cfg.MODEL.MASK_FORMER.PIXEL_CLASS_WEIGHT = 2.0 44 | cfg.MODEL.MASK_FORMER.REGION_PROXY_CLS_WEIGHT = 2.0 45 | cfg.MODEL.MASK_FORMER.CONTRASTIVE_WEIGH = 2.0 46 | cfg.MODEL.MASK_FORMER.CONTRASTIVE_LOSS = False 47 | # cfg.MODEL.MASK_FORMER.EDGE_DISTANCES = [1, 2, 4, 8] 48 | cfg.MODEL.MASK_FORMER.HIGH_THRESHOLD = 0.3 49 | cfg.MODEL.MASK_FORMER.LOW_THRESHOLD = 0.05 50 | cfg.MODEL.MASK_FORMER.RETURN_ITERATION = False 51 | cfg.MODEL.MASK_FORMER.OBLIQUE_DISTANCES = [1, 2, 4, 8] 52 | # cfg.MODEL.MASK_FORMER.BYOL_WEIGH = 2.0 53 | # cfg.MODEL.MASK_FORMER.EDGE_WEIGH = 2.0 54 | # cfg.MODEL.MASK_FORMER.PSEUDO_EDGE_WEIGH = 2.0 55 | cfg.MODEL.MASK_FORMER.SPIX_PIXEL_CLS_WEIGH = 2.0 56 | # cfg.MODEL.MASK_FORMER.BYOL_LOSS = False 57 | # cfg.MODEL.MASK_FORMER.EDGE_LOSS = False 58 | cfg.MODEL.MASK_FORMER.CONTRASTIVE_TAU = 0.3 59 | cfg.MODEL.MASK_FORMER.COMPUTE_RAMA = False 60 | cfg.MODEL.MASK_FORMER.RECONSTRUCT_LOSS = False 61 | cfg.MODEL.MASK_FORMER.RECONSTRUCT_COLOR = False 62 | cfg.MODEL.MASK_FORMER.RECONSTRUCT_COORD = False 63 | cfg.MODEL.MASK_FORMER.STAGE_WEIGHTS = [1.0, 1.0] 64 | cfg.MODEL.MASK_FORMER.SPIX_MASK_STAGE2 = 1.0 65 | 66 | # transformer config 67 | cfg.MODEL.MASK_FORMER.NHEADS = 8 68 | cfg.MODEL.MASK_FORMER.DROPOUT = 0.1 69 | cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD = 2048 70 | cfg.MODEL.MASK_FORMER.ENC_LAYERS = 0 71 | cfg.MODEL.MASK_FORMER.DEC_LAYERS = 6 72 | cfg.MODEL.MASK_FORMER.SPIX_SELF_ATTEN_LAYERS = 6 73 | cfg.MODEL.MASK_FORMER.PRE_NORM = False 74 | 75 | cfg.MODEL.MASK_FORMER.HIDDEN_DIM = 256 76 | cfg.MODEL.MASK_FORMER.CONTRASTIVE_DIM = 128 77 | cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES = 100 78 | 79 | cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE = "res5" 80 | cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ = False 81 | 82 | # mask_former inference config 83 | 
cfg.MODEL.MASK_FORMER.TEST = CN() 84 | cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True 85 | cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = False 86 | cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = False 87 | cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0 88 | cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD = 0.0 89 | cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False 90 | # cfg.TEST.MODE = "whole" # "whole" or "slide" 91 | # cfg.TEST.STRIDE = (300, 768) 92 | # cfg.TEST.CROP_SIZE = (512, 1024) 93 | cfg.TEST.CLUSTER_SOFTMAX = False 94 | cfg.TEST.PRED_STAGE = "all" 95 | 96 | # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet) 97 | # you can use this config to override 98 | cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY = 32 99 | 100 | cfg.MODEL.MASK_FORMER.GZERO_CALIBRATE = -1.0 101 | cfg.MODEL.MASK_FORMER.ENSEMBLING = False 102 | cfg.MODEL.MASK_FORMER.ENSEMBLING_ALL_CLS = False 103 | 104 | # vis 105 | cfg.MODEL.MASK_FORMER.VIS = False 106 | cfg.MODEL.MASK_FORMER.QUERY_SHAPE = [8, 16] # h, w 107 | cfg.MODEL.MASK_FORMER.ENSEMBLING_START = 1 108 | 109 | # pixel decoder config 110 | cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256 111 | # adding transformer in pixel decoder 112 | cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0 113 | # pixel decoder 114 | cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder" 115 | # gzero calibrate 116 | cfg.MODEL.SEM_SEG_HEAD.GZERO_CALIBRATE = -1.0 117 | 118 | # swin transformer backbone 119 | cfg.MODEL.SWIN = CN() 120 | cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224 121 | cfg.MODEL.SWIN.PATCH_SIZE = 4 122 | cfg.MODEL.SWIN.EMBED_DIM = 96 123 | cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2] 124 | cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24] 125 | cfg.MODEL.SWIN.WINDOW_SIZE = 7 126 | cfg.MODEL.SWIN.MLP_RATIO = 4.0 127 | cfg.MODEL.SWIN.QKV_BIAS = True 128 | cfg.MODEL.SWIN.QK_SCALE = None 129 | cfg.MODEL.SWIN.DROP_RATE = 0.0 130 | cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0 131 | cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3 132 | cfg.MODEL.SWIN.APE = False 133 | cfg.MODEL.SWIN.PATCH_NORM = True 134 | cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"] 135 | cfg.MODEL.SWIN.USE_CHECKPOINT = False 136 | 137 | # pvt backbone 138 | cfg.MODEL.PVTV2 = CN() 139 | cfg.MODEL.PVTV2.PATCH_SIZE = 4 140 | cfg.MODEL.PVTV2.IN_CHANS = 3 141 | cfg.MODEL.PVTV2.EMBED_DIMS = [32, 64, 160, 256] 142 | cfg.MODEL.PVTV2.NUM_HEADS = [1, 2, 5, 8] 143 | cfg.MODEL.PVTV2.MLP_RATIO = [8, 8, 4, 4] 144 | cfg.MODEL.PVTV2.QKV_BIAS = True 145 | cfg.MODEL.PVTV2.DROP_RATE = 0.0 146 | cfg.MODEL.PVTV2.DROP_PATH_RATE = 0. 147 | cfg.MODEL.PVTV2.QK_SCALE = None 148 | cfg.MODEL.PVTV2.DEPTHS = [2, 2, 2, 2] 149 | cfg.MODEL.PVTV2.SR_RATIOS = [8, 4, 2, 1] 150 | cfg.MODEL.PVTV2.OUT_FEATURES = ["res2", "res3", "res4", "res5"] 151 | 152 | 153 | cfg.MODEL.SEM_SEG_HEAD.MASKATTENTIONPOOL = False 154 | cfg.MODEL.SEM_SEG_HEAD.TEMPERATURE = 0.01 155 | cfg.MODEL.SEM_SEG_HEAD.GAT_NUM_LAYERS = 2 156 | cfg.MODEL.SEM_SEG_HEAD.DOWNSAMPLE_RATE = 4 157 | # cfg.MODEL.CRITERION = "spix" # default 158 | 159 | # self training config 160 | cfg.MODEL.PSEUDO_LABEL = False 161 | cfg.MODEL.PSEUDO_WEIGHT = 1.0 162 | cfg.MODEL.PSEUDO_THR = -1. 
163 | 164 | 165 | cfg.MODEL.DYNAMIC_MEN_STD = False 166 | # cfg.MODEL.LAB_INPUT = False 167 | 168 | # NOTE: maskformer2 extra conffigs 169 | # transformer module 170 | cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME = "MultiScaleMaskedTransformerDecoder" 171 | 172 | # LSJ aug 173 | cfg.INPUT.IMAGE_SIZE = 1024 174 | cfg.INPUT.MIN_SCALE = 0.1 175 | cfg.INPUT.MAX_SCALE = 2.0 176 | 177 | # MSDeformAttn encoder configs 178 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"] 179 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4 180 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8 181 | 182 | # point loss configs 183 | # Number of points sampled during training for a mask point head. 184 | cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS = 112 * 112 185 | # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the 186 | # original paper. 187 | cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO = 3.0 188 | # Importance sampling parameter for PointRend point sampling during training. Parametr `beta` in 189 | # the original paper. 190 | cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75 191 | 192 | # params for groupformer 193 | cfg.MODEL.SEM_SEG_HEAD.NUM_GROUP_TOKENS = [256, 128, 64] 194 | cfg.MODEL.SEM_SEG_HEAD.NUM_OUTPUT_GROUPS = [256, 128, 64] 195 | cfg.MODEL.SEM_SEG_HEAD.NUM_HEADS = [8, 8, 8] 196 | cfg.MODEL.SEM_SEG_HEAD.SPIX_RES = [32, 32] 197 | cfg.MODEL.SEM_SEG_HEAD.MASK_POOL_STYLE = "attn_pool" 198 | cfg.MODEL.SEM_SEG_HEAD.TAU = 0.07 199 | 200 | cfg.MODEL.OUT_SUBMISSION_FORMAT = False 201 | 202 | cfg.MODEL.SEM_SEG_HEAD.SPIX_SELF_ATTEN = True 203 | cfg.MODEL.SEM_SEG_HEAD.SPIX_FFN = True 204 | -------------------------------------------------------------------------------- /demo/predictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copied from: https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py 3 | import atexit 4 | import bisect 5 | import multiprocessing as mp 6 | from collections import deque 7 | 8 | import cv2 9 | import torch 10 | 11 | from detectron2.data import MetadataCatalog 12 | from detectron2.engine.defaults import DefaultPredictor 13 | from detectron2.utils.video_visualizer import VideoVisualizer 14 | from detectron2.utils.visualizer import ColorMode, Visualizer 15 | 16 | 17 | class VisualizationDemo(object): 18 | def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False): 19 | """ 20 | Args: 21 | cfg (CfgNode): 22 | instance_mode (ColorMode): 23 | parallel (bool): whether to run the model in different processes from visualization. 24 | Useful since the visualization logic can be slow. 25 | """ 26 | # import ipdb; ipdb.set_trace() 27 | # self.metadata = MetadataCatalog.get( 28 | # cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused" 29 | # ) 30 | # TODO: fix it, sorry, hard coded for cityscapes categories 31 | self.metadata = MetadataCatalog.get( 32 | cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused" 33 | ) 34 | self.cpu_device = torch.device("cpu") 35 | self.instance_mode = instance_mode 36 | 37 | self.parallel = parallel 38 | if parallel: 39 | num_gpu = torch.cuda.device_count() 40 | self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu) 41 | else: 42 | self.predictor = DefaultPredictor(cfg) 43 | 44 | def run_on_image(self, image): 45 | """ 46 | Args: 47 | image (np.ndarray): an image of shape (H, W, C) (in BGR order). 
48 | This is the format used by OpenCV. 49 | Returns: 50 | predictions (dict): the output of the model. 51 | vis_output (VisImage): the visualized image output. 52 | """ 53 | vis_output = None 54 | predictions = self.predictor(image) 55 | # Convert image from OpenCV BGR format to Matplotlib RGB format. 56 | image = image[:, :, ::-1] 57 | # import ipdb; ipdb.set_trace() 58 | visualizer = Visualizer(image, self.metadata, instance_mode=self.instance_mode) 59 | if "panoptic_seg" in predictions: 60 | panoptic_seg, segments_info = predictions["panoptic_seg"] 61 | vis_output = visualizer.draw_panoptic_seg_predictions( 62 | panoptic_seg.to(self.cpu_device), segments_info 63 | ) 64 | else: 65 | if "sem_seg" in predictions: 66 | vis_output = visualizer.draw_sem_seg( 67 | predictions["sem_seg"].argmax(dim=0).to(self.cpu_device) 68 | ) 69 | if "instances" in predictions: 70 | instances = predictions["instances"].to(self.cpu_device) 71 | vis_output = visualizer.draw_instance_predictions(predictions=instances) 72 | 73 | return predictions, vis_output 74 | 75 | def _frame_from_video(self, video): 76 | while video.isOpened(): 77 | success, frame = video.read() 78 | if success: 79 | yield frame 80 | else: 81 | break 82 | 83 | def run_on_video(self, video): 84 | """ 85 | Visualizes predictions on frames of the input video. 86 | Args: 87 | video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be 88 | either a webcam or a video file. 89 | Yields: 90 | ndarray: BGR visualizations of each video frame. 91 | """ 92 | video_visualizer = VideoVisualizer(self.metadata, self.instance_mode) 93 | 94 | def process_predictions(frame, predictions): 95 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 96 | if "panoptic_seg" in predictions: 97 | panoptic_seg, segments_info = predictions["panoptic_seg"] 98 | vis_frame = video_visualizer.draw_panoptic_seg_predictions( 99 | frame, panoptic_seg.to(self.cpu_device), segments_info 100 | ) 101 | elif "instances" in predictions: 102 | predictions = predictions["instances"].to(self.cpu_device) 103 | vis_frame = video_visualizer.draw_instance_predictions(frame, predictions) 104 | elif "sem_seg" in predictions: 105 | vis_frame = video_visualizer.draw_sem_seg( 106 | frame, predictions["sem_seg"].argmax(dim=0).to(self.cpu_device) 107 | ) 108 | 109 | # Converts Matplotlib RGB format to OpenCV BGR format 110 | vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR) 111 | return vis_frame 112 | 113 | frame_gen = self._frame_from_video(video) 114 | if self.parallel: 115 | buffer_size = self.predictor.default_buffer_size 116 | 117 | frame_data = deque() 118 | 119 | for cnt, frame in enumerate(frame_gen): 120 | frame_data.append(frame) 121 | self.predictor.put(frame) 122 | 123 | if cnt >= buffer_size: 124 | frame = frame_data.popleft() 125 | predictions = self.predictor.get() 126 | yield process_predictions(frame, predictions) 127 | 128 | while len(frame_data): 129 | frame = frame_data.popleft() 130 | predictions = self.predictor.get() 131 | yield process_predictions(frame, predictions) 132 | else: 133 | for frame in frame_gen: 134 | yield process_predictions(frame, self.predictor(frame)) 135 | 136 | 137 | class AsyncPredictor: 138 | """ 139 | A predictor that runs the model asynchronously, possibly on >1 GPUs. 140 | Because rendering the visualization takes considerably amount of time, 141 | this helps improve throughput a little bit when rendering videos. 
142 | """ 143 | 144 | class _StopToken: 145 | pass 146 | 147 | class _PredictWorker(mp.Process): 148 | def __init__(self, cfg, task_queue, result_queue): 149 | self.cfg = cfg 150 | self.task_queue = task_queue 151 | self.result_queue = result_queue 152 | super().__init__() 153 | 154 | def run(self): 155 | predictor = DefaultPredictor(self.cfg) 156 | 157 | while True: 158 | task = self.task_queue.get() 159 | if isinstance(task, AsyncPredictor._StopToken): 160 | break 161 | idx, data = task 162 | result = predictor(data) 163 | self.result_queue.put((idx, result)) 164 | 165 | def __init__(self, cfg, num_gpus: int = 1): 166 | """ 167 | Args: 168 | cfg (CfgNode): 169 | num_gpus (int): if 0, will run on CPU 170 | """ 171 | num_workers = max(num_gpus, 1) 172 | self.task_queue = mp.Queue(maxsize=num_workers * 3) 173 | self.result_queue = mp.Queue(maxsize=num_workers * 3) 174 | self.procs = [] 175 | for gpuid in range(max(num_gpus, 1)): 176 | cfg = cfg.clone() 177 | cfg.defrost() 178 | cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu" 179 | self.procs.append( 180 | AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue) 181 | ) 182 | 183 | self.put_idx = 0 184 | self.get_idx = 0 185 | self.result_rank = [] 186 | self.result_data = [] 187 | 188 | for p in self.procs: 189 | p.start() 190 | atexit.register(self.shutdown) 191 | 192 | def put(self, image): 193 | self.put_idx += 1 194 | self.task_queue.put((self.put_idx, image)) 195 | 196 | def get(self): 197 | self.get_idx += 1 # the index needed for this request 198 | if len(self.result_rank) and self.result_rank[0] == self.get_idx: 199 | res = self.result_data[0] 200 | del self.result_data[0], self.result_rank[0] 201 | return res 202 | 203 | while True: 204 | # make sure the results are returned in the correct order 205 | idx, res = self.result_queue.get() 206 | if idx == self.get_idx: 207 | return res 208 | insert = bisect.bisect(self.result_rank, idx) 209 | self.result_rank.insert(insert, idx) 210 | self.result_data.insert(insert, res) 211 | 212 | def __len__(self): 213 | return self.put_idx - self.get_idx 214 | 215 | def __call__(self, image): 216 | self.put(image) 217 | return self.get() 218 | 219 | def shutdown(self): 220 | for _ in self.procs: 221 | self.task_queue.put(AsyncPredictor._StopToken()) 222 | 223 | @property 224 | def default_buffer_size(self): 225 | return len(self.procs) * 5 226 | --------------------------------------------------------------------------------
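
A minimal sketch of how the pieces above fit together: `add_maskformer2_config` from hgformer/config.py registers the extra options, and `VisualizationDemo` from demo/predictor.py wraps a `DefaultPredictor` for single-image inference. This assumes detectron2 (including its DeepLab project) is installed and the repository root is importable; the config file comes from the `configs/` tree above, while the checkpoint and input image names are placeholders. The repository's own entry point for this workflow is demo/inference.py, so treat this only as an illustrative sketch, not the canonical script.

import cv2
from detectron2.config import get_cfg
from detectron2.projects.deeplab import add_deeplab_config

from hgformer.config import add_maskformer2_config   # defined in hgformer/config.py above
from demo.predictor import VisualizationDemo         # defined in demo/predictor.py above


def setup_cfg(config_file, weights):
    cfg = get_cfg()
    add_deeplab_config(cfg)      # assumption: the base configs rely on DeepLab project keys
    add_maskformer2_config(cfg)  # adds the MASK_FORMER / SEM_SEG_HEAD / SWIN options shown above
    cfg.merge_from_file(config_file)
    cfg.MODEL.WEIGHTS = weights  # placeholder checkpoint path
    cfg.freeze()
    return cfg


if __name__ == "__main__":
    cfg = setup_cfg("configs/cityscapes/hgformer_R50_bs16_20k.yaml", "model_final.pth")
    demo = VisualizationDemo(cfg)       # parallel=True would route frames through AsyncPredictor
    image = cv2.imread("input.jpg")     # BGR (H, W, C), the format run_on_image expects
    predictions, vis_output = demo.run_on_image(image)
    vis_output.save("output.jpg")       # VisImage.save writes the visualization to disk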