├── hgformer
│   ├── evaluation
│   │   ├── __init__.py
│   │   └── instance_evaluation.py
│   ├── utils
│   │   ├── __init__.py
│   │   └── misc.py
│   ├── modeling
│   │   ├── backbone
│   │   │   └── __init__.py
│   │   ├── meta_arch
│   │   │   ├── __init__.py
│   │   │   ├── group_former_head.py
│   │   │   └── mask_former_head.py
│   │   ├── pixel_decoder
│   │   │   ├── __init__.py
│   │   │   └── ops
│   │   │       ├── make.sh
│   │   │       ├── modules
│   │   │       │   ├── __init__.py
│   │   │       │   └── ms_deform_attn.py
│   │   │       ├── functions
│   │   │       │   ├── __init__.py
│   │   │       │   └── ms_deform_attn_func.py
│   │   │       ├── src
│   │   │       │   ├── vision.cpp
│   │   │       │   ├── cuda
│   │   │       │   │   ├── ms_deform_attn_cuda.h
│   │   │       │   │   └── ms_deform_attn_cuda.cu
│   │   │       │   ├── cpu
│   │   │       │   │   ├── ms_deform_attn_cpu.h
│   │   │       │   │   └── ms_deform_attn_cpu.cpp
│   │   │       │   └── ms_deform_attn.h
│   │   │       ├── setup.py
│   │   │       └── test.py
│   │   ├── transformer_decoder
│   │   │   ├── __init__.py
│   │   │   ├── position_encoding.py
│   │   │   └── maskformer_transformer_decoder.py
│   │   └── __init__.py
│   ├── data
│   │   ├── dataset_mappers
│   │   │   ├── __init__.py
│   │   │   ├── mask_former_panoptic_dataset_mapper.py
│   │   │   ├── mask_former_instance_dataset_mapper.py
│   │   │   └── mask_former_semantic_dataset_mapper.py
│   │   ├── __init__.py
│   │   ├── samplers
│   │   │   ├── __init__.py
│   │   │   ├── grouped_batch_sampler.py
│   │   │   └── balanced_sampler.py
│   │   └── datasets
│   │       ├── __init__.py
│   │       ├── register_gta.py
│   │       ├── register_synthia.py
│   │       ├── register_bdd.py
│   │       ├── register_mapillary_19.py
│   │       ├── register_city_c.py
│   │       └── register_city_c_vis.py
│   ├── __init__.py
│   ├── test_time_augmentation.py
│   └── config.py
├── requirements.txt
├── configs
│   ├── cityscapes
│   │   ├── maskformer2_swin_large_IN21k_384_bs16_20k.yaml
│   │   ├── hgformer_swin_large_IN21K_384_bs16_20k.yaml
│   │   ├── maskformer2_swin_tiny_bs16_20k.yaml
│   │   ├── hgformer_swin_tiny_bs16_20k.yaml
│   │   ├── maskformer2_R50_bs16_20k_gn.yaml
│   │   ├── Base-Cityscapes-SemanticSegmentation.yaml
│   │   └── hgformer_R50_bs16_20k.yaml
│   ├── mapillary
│   │   ├── maskformer2_swin_large_IN21k_384_bs16_20k_mapillary.yaml
│   │   ├── hgformer_swin_tiny_bs16_20k_mapillary.yaml
│   │   ├── maskformer2_swin_tiny_bs16_20k_mapillary.yaml
│   │   ├── hgformer_swin_large_IN21k_384_bs16_20k_mapillary.yaml
│   │   ├── maskformer2_R50_bs16_90k_gn_mapillary_20k.yaml
│   │   ├── Base-mapillary19-SemanticSegmentation.yaml
│   │   └── hgformer_R50_bs16_20k_mapillary.yaml
│   └── city_c
│       ├── hgformer_swin_large_IN21K_384_bs16_20k.yaml
│       ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml
│       ├── hgformer_swin_tiny_bs16_20k.yaml
│       └── maskformer2_swin_tiny_bs16_20k.yaml
├── .gitignore
├── tools
│   ├── convert-pretrained-swin-model-to-d2.py
│   ├── convert-torchvision-to-d2.py
│   ├── evaluate_coco_boundary_ap.py
│   ├── README.md
│   ├── visualize_data.py
│   └── analyze_model.py
├── README.md
├── datasets
│   ├── split_data
│   │   ├── gta
│   │   │   ├── resize_img.py
│   │   │   └── split_gta.py
│   │   └── synthia
│   │       └── split_synthia.py
│   ├── prepare_gta_sem_seg.py
│   ├── generate_cityscapes_c.py
│   ├── find_truncated_images.py
│   ├── prepare_mapillary_sem_seg.py
│   ├── prepare_synthia_sem_seg.py
│   └── README.md
├── INSTALL.md
├── GETTING_STARTED.md
├── MODEL_ZOO.md
└── demo
    ├── inference.py
    └── predictor.py
/hgformer/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/hgformer/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
--------------------------------------------------------------------------------
/hgformer/modeling/backbone/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
--------------------------------------------------------------------------------
/hgformer/data/dataset_mappers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
--------------------------------------------------------------------------------
/hgformer/modeling/meta_arch/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
--------------------------------------------------------------------------------
/hgformer/modeling/pixel_decoder/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
--------------------------------------------------------------------------------
/hgformer/data/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from . import datasets
--------------------------------------------------------------------------------
/hgformer/data/samplers/__init__.py:
--------------------------------------------------------------------------------
1 | from .balanced_sampler import (
2 | BalancedTrainingSampler,
3 | )
4 |
5 |
6 | __all__ = [
7 | "BalancedTrainingSampler",
8 | ]
9 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | cython
2 | scipy
3 | numpy==1.23.1
4 | setuptools==58.0.4
5 | shapely
6 | timm
7 | h5py
8 | submitit
9 | scikit-image
10 | ftfy
11 | einops
12 | regex
13 | mmcv
14 | imagecorruptions
--------------------------------------------------------------------------------
/hgformer/data/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from . import (
3 | # register_acdc,
4 | register_gta,
5 | register_city_c,
6 | register_bdd,
7 | register_synthia,
8 | register_mapillary_19,
9 | register_city_c_vis,
10 | )
11 |
--------------------------------------------------------------------------------
/hgformer/modeling/transformer_decoder/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from .maskformer_transformer_decoder import StandardTransformerDecoder
3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder
4 | from .groupFormer_transformer_decoder import GroupFormerDecoder
5 | from .mask2former_transformer_decoder_wo_maskatten import MultiScaleMaskedTransformerDecoderWoMaskAtten
6 |
--------------------------------------------------------------------------------
/configs/cityscapes/maskformer2_swin_large_IN21k_384_bs16_20k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: maskformer2_R50_bs16_20k_gn.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "D2SwinTransformer"
5 | SWIN:
6 | EMBED_DIM: 192
7 | DEPTHS: [2, 2, 18, 2]
8 | NUM_HEADS: [6, 12, 24, 48]
9 | WINDOW_SIZE: 12
10 | APE: False
11 | DROP_PATH_RATE: 0.3
12 | PATCH_NORM: True
13 | PRETRAIN_IMG_SIZE: 384
14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
15 | PIXEL_MEAN: [123.675, 116.280, 103.530]
16 | PIXEL_STD: [58.395, 57.120, 57.375]
17 | MASK_FORMER:
18 | NUM_OBJECT_QUERIES: 100
19 | SOLVER:
20 | MAX_ITER: 20000
--------------------------------------------------------------------------------
/configs/mapillary/maskformer2_swin_large_IN21k_384_bs16_20k_mapillary.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: maskformer2_R50_bs16_90k_gn_mapillary_20k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "D2SwinTransformer"
5 | SWIN:
6 | EMBED_DIM: 192
7 | DEPTHS: [2, 2, 18, 2]
8 | NUM_HEADS: [6, 12, 24, 48]
9 | WINDOW_SIZE: 12
10 | APE: False
11 | DROP_PATH_RATE: 0.3
12 | PATCH_NORM: True
13 | PRETRAIN_IMG_SIZE: 384
14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
15 | PIXEL_MEAN: [123.675, 116.280, 103.530]
16 | PIXEL_STD: [58.395, 57.120, 57.375]
17 | MASK_FORMER:
18 | NUM_OBJECT_QUERIES: 100
19 | SOLVER:
20 | MAX_ITER: 20000
--------------------------------------------------------------------------------
/configs/mapillary/hgformer_swin_tiny_bs16_20k_mapillary.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: hgformer_R50_bs16_20k_mapillary.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "D2SwinTransformer"
5 | SWIN:
6 | EMBED_DIM: 96
7 | DEPTHS: [2, 2, 6, 2]
8 | NUM_HEADS: [3, 6, 12, 24]
9 | WINDOW_SIZE: 7
10 | APE: False
11 | DROP_PATH_RATE: 0.3
12 | PATCH_NORM: True
13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl"
14 | PIXEL_MEAN: [123.675, 116.280, 103.530]
15 | PIXEL_STD: [58.395, 57.120, 57.375]
16 |
17 | SOLVER:
18 | MAX_ITER: 20000
19 | IMS_PER_BATCH: 16
20 |
21 | TEST:
22 | CLUSTER_SOFTMAX: True
23 | PRED_STAGE: "spix_pixelexclude0125+stage3"
--------------------------------------------------------------------------------
/configs/mapillary/maskformer2_swin_tiny_bs16_20k_mapillary.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: maskformer2_R50_bs16_90k_gn_mapillary_20k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "D2SwinTransformer"
5 | SWIN:
6 | EMBED_DIM: 96
7 | DEPTHS: [2, 2, 6, 2]
8 | NUM_HEADS: [3, 6, 12, 24]
9 | WINDOW_SIZE: 7
10 | APE: False
11 | DROP_PATH_RATE: 0.3
12 | PATCH_NORM: True
13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl"
14 | PIXEL_MEAN: [123.675, 116.280, 103.530]
15 | PIXEL_STD: [58.395, 57.120, 57.375]
16 |
17 | DATALOADER:
18 | FILTER_EMPTY_ANNOTATIONS: True
19 | NUM_WORKERS: 4
20 | VERSION: 2
21 | SOLVER:
22 | MAX_ITER: 20000
23 |
24 | CUDNN_BENCHMARK: True
25 |
--------------------------------------------------------------------------------
/configs/city_c/hgformer_swin_large_IN21K_384_bs16_20k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: hgformer_swin_tiny_bs16_20k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "D2SwinTransformer"
5 | SWIN:
6 | EMBED_DIM: 192
7 | DEPTHS: [2, 2, 18, 2]
8 | NUM_HEADS: [6, 12, 24, 48]
9 | WINDOW_SIZE: 12
10 | APE: False
11 | DROP_PATH_RATE: 0.3
12 | PATCH_NORM: True
13 | PRETRAIN_IMG_SIZE: 384
14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
15 | PIXEL_MEAN: [123.675, 116.280, 103.530]
16 | PIXEL_STD: [58.395, 57.120, 57.375]
17 |
18 | SOLVER:
19 | MAX_ITER: 20000
20 | # IMS_PER_BATCH: 2
21 |
22 | TEST:
23 | CLUSTER_SOFTMAX: True
24 | PRED_STAGE: "spix_all_stage_exclude012"
--------------------------------------------------------------------------------
/configs/cityscapes/hgformer_swin_large_IN21K_384_bs16_20k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: hgformer_R50_bs16_20k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "D2SwinTransformer"
5 | SWIN:
6 | EMBED_DIM: 192
7 | DEPTHS: [2, 2, 18, 2]
8 | NUM_HEADS: [6, 12, 24, 48]
9 | WINDOW_SIZE: 12
10 | APE: False
11 | DROP_PATH_RATE: 0.3
12 | PATCH_NORM: True
13 | PRETRAIN_IMG_SIZE: 384
14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
15 | PIXEL_MEAN: [123.675, 116.280, 103.530]
16 | PIXEL_STD: [58.395, 57.120, 57.375]
17 |
18 | SOLVER:
19 | MAX_ITER: 20000
20 | # IMS_PER_BATCH: 2
21 |
22 | TEST:
23 | CLUSTER_SOFTMAX: True
24 | PRED_STAGE: "spix_pixelexclude0125+stage3"
--------------------------------------------------------------------------------
/configs/mapillary/hgformer_swin_large_IN21k_384_bs16_20k_mapillary.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: hgformer_R50_bs16_20k_mapillary.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "D2SwinTransformer"
5 | SWIN:
6 | EMBED_DIM: 192
7 | DEPTHS: [2, 2, 18, 2]
8 | NUM_HEADS: [6, 12, 24, 48]
9 | WINDOW_SIZE: 12
10 | APE: False
11 | DROP_PATH_RATE: 0.3
12 | PATCH_NORM: True
13 | PRETRAIN_IMG_SIZE: 384
14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
15 | PIXEL_MEAN: [123.675, 116.280, 103.530]
16 | PIXEL_STD: [58.395, 57.120, 57.375]
17 |
18 | SOLVER:
19 | MAX_ITER: 20000
20 | # IMS_PER_BATCH: 2
21 |
22 | TEST:
23 | CLUSTER_SOFTMAX: True
24 | PRED_STAGE: "spix_pixelexclude0125+stage3"
--------------------------------------------------------------------------------
/hgformer/modeling/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from .backbone.swin import D2SwinTransformer, D2SwinTransformerFreeze
3 | from .pixel_decoder.fpn import BasePixelDecoder
4 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder
5 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoderv2
6 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoderv3
7 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecodervSingleLayer
8 | from .meta_arch.mask_former_head import MaskFormerHead
9 | from .meta_arch.group_former_head import GroupFormerHead
10 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead
11 |
--------------------------------------------------------------------------------
/configs/cityscapes/maskformer2_swin_tiny_bs16_20k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: maskformer2_R50_bs16_20k_gn.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "D2SwinTransformer"
5 | SWIN:
6 | EMBED_DIM: 96
7 | DEPTHS: [2, 2, 6, 2]
8 | NUM_HEADS: [3, 6, 12, 24]
9 | WINDOW_SIZE: 7
10 | APE: False
11 | DROP_PATH_RATE: 0.3
12 | PATCH_NORM: True
13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl"
14 | PIXEL_MEAN: [123.675, 116.280, 103.530]
15 | PIXEL_STD: [58.395, 57.120, 57.375]
16 | #DATASETS:
17 | # TRAIN: ("cityscapes_fine_sem_seg_train",)
18 | # TEST: ("cityscapes_fine_sem_seg_val",)
19 | DATALOADER:
20 | FILTER_EMPTY_ANNOTATIONS: True
21 | NUM_WORKERS: 4
22 | VERSION: 2
23 | SOLVER:
24 | MAX_ITER: 20000
25 |
26 | CUDNN_BENCHMARK: True
27 |
--------------------------------------------------------------------------------
/configs/cityscapes/hgformer_swin_tiny_bs16_20k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: hgformer_R50_bs16_20k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "D2SwinTransformer"
5 | SWIN:
6 | EMBED_DIM: 96
7 | DEPTHS: [2, 2, 6, 2]
8 | NUM_HEADS: [3, 6, 12, 24]
9 | WINDOW_SIZE: 7
10 | APE: False
11 | DROP_PATH_RATE: 0.3
12 | PATCH_NORM: True
13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl"
14 | PIXEL_MEAN: [123.675, 116.280, 103.530]
15 | PIXEL_STD: [58.395, 57.120, 57.375]
16 | #DATASETS:
17 | # TRAIN: ("cityscapes_fine_sem_seg_train",)
18 | # TEST: ("synthia_val", "cityscapes_fine_sem_seg_val", "bdd_val", "mapillary_val", "gta_trainid_val")
19 | SOLVER:
20 | MAX_ITER: 20000
21 | IMS_PER_BATCH: 16
22 |
23 | TEST:
24 | CLUSTER_SOFTMAX: True
25 | PRED_STAGE: "spix_pixelexclude0125+stage3"
--------------------------------------------------------------------------------
/hgformer/modeling/pixel_decoder/ops/make.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # ------------------------------------------------------------------------------------------------
3 | # Deformable DETR
4 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | # ------------------------------------------------------------------------------------------------
7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | # ------------------------------------------------------------------------------------------------
9 |
10 | # Copyright (c) Facebook, Inc. and its affiliates.
11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
12 |
13 | python setup.py build install
14 |
--------------------------------------------------------------------------------
/hgformer/modeling/pixel_decoder/ops/modules/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 |
12 | from .ms_deform_attn import MSDeformAttn
13 |
--------------------------------------------------------------------------------
/hgformer/modeling/pixel_decoder/ops/functions/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 |
12 | from .ms_deform_attn_func import MSDeformAttnFunction
13 |
14 |
--------------------------------------------------------------------------------
/hgformer/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from . import data # register all new datasets
3 | from . import modeling
4 |
5 | # config
6 | from .config import add_maskformer2_config
7 |
8 | # dataset loading
9 |
10 | from .data.dataset_mappers.mask_former_instance_dataset_mapper import (
11 | MaskFormerInstanceDatasetMapper,
12 | )
13 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import (
14 | MaskFormerPanopticDatasetMapper,
15 | )
16 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import (
17 | MaskFormerSemanticDatasetMapper,
18 | )
19 |
20 |
21 | # models
22 | from .maskformer_model import MaskFormer
23 | from .test_time_augmentation import SemanticSegmentorWithTTA
24 | from .groupformer_model import GroupFormer
25 |
26 | # evaluation
27 | from .evaluation.instance_evaluation import InstanceSegEvaluator
28 |
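29 | # Typical wiring in a training/inference script (illustrative sketch only, mirroring the
30 | # Mask2Former/detectron2 pattern this codebase is based on; importing `hgformer` also
31 | # registers the custom datasets):
32 | #
33 | #   from detectron2.config import get_cfg
34 | #   from detectron2.projects.deeplab import add_deeplab_config
35 | #   import hgformer
36 | #
37 | #   cfg = get_cfg()
38 | #   add_deeplab_config(cfg)
39 | #   hgformer.add_maskformer2_config(cfg)
40 | #   cfg.merge_from_file("configs/cityscapes/hgformer_R50_bs16_20k.yaml")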
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # output dir
2 | output
3 | instant_test_output
4 | inference_test_output
5 |
6 |
7 | *.png
8 | *.json
9 | *.diff
10 | *.jpg
11 | !/projects/DensePose/doc/images/*.jpg
12 |
13 | # compilation and distribution
14 | __pycache__
15 | _ext
16 | *.pyc
17 | *.pyd
18 | *.so
19 | *.dll
20 | *.egg-info/
21 | build/
22 | dist/
23 | wheels/
24 |
25 | # pytorch/python/numpy formats
26 | *.pth
27 | *.pkl
28 | *.npy
29 | *.ts
30 | model_ts*.txt
31 |
32 | # ipython/jupyter notebooks
33 | *.ipynb
34 | **/.ipynb_checkpoints/
35 |
36 | # Editor temporaries
37 | *.swn
38 | *.swo
39 | *.swp
40 | *~
41 |
42 | # editor settings
43 | .idea
44 | .vscode
45 | _darcs
46 |
47 | # project dirs
48 | /detectron2/model_zoo/configs
49 | # /datasets/*
50 | !/datasets/*.*
51 | /projects/*/datasets
52 | /models
53 | /snippet
54 | /GroupViT
55 | /work_dirs
56 | /work_dirs_1
57 | test*.sh
58 | start*.sh
59 | slurm*
60 | /detectron2
61 |
--------------------------------------------------------------------------------
/tools/convert-pretrained-swin-model-to-d2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
3 |
4 | import pickle as pkl
5 | import sys
6 |
7 | import torch
8 |
9 | """
10 | Usage:
11 | # download pretrained swin model:
12 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth
13 | # run the conversion
14 | ./convert-pretrained-swin-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl
15 | # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config:
16 | MODEL:
17 | WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl"
18 | INPUT:
19 | FORMAT: "RGB"
20 | """
21 |
22 | if __name__ == "__main__":
23 | input = sys.argv[1]
24 |
25 | obj = torch.load(input, map_location="cpu")["model"]
26 |
27 | res = {"model": obj, "__author__": "third_party", "matching_heuristics": True}
28 |
29 | with open(sys.argv[2], "wb") as f:
30 | pkl.dump(res, f)
31 |
--------------------------------------------------------------------------------
/hgformer/modeling/pixel_decoder/ops/src/vision.cpp:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #include "ms_deform_attn.h"
17 |
18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
21 | }
22 |
--------------------------------------------------------------------------------
/hgformer/data/datasets/register_gta.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import os
3 |
4 | from detectron2.data import DatasetCatalog, MetadataCatalog
5 | from detectron2.data.datasets import load_sem_seg
6 | from detectron2.data.datasets.builtin_meta import _get_builtin_metadata
7 |
8 | # ==== Predefined splits for raw gta images ===========
9 |
10 | GTA_Trainid = {
11 | "gta_trainid_val": ("gta/images/valid/", "gta/labels_detectron2/valid/"),
12 | }
13 |
14 | def register_all_gta_sem_seg(root):
15 | for key, (image_dir, gt_dir) in GTA_Trainid.items():
16 | meta = _get_builtin_metadata("cityscapes")
17 | image_dir = os.path.join(root, image_dir)
18 | gt_dir = os.path.join(root, gt_dir)
19 |
20 | DatasetCatalog.register(
21 | key, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="png")
22 | )
23 | MetadataCatalog.get(key).set(
24 | image_dir=image_dir,
25 | gt_dir=gt_dir,
26 | evaluator_type="sem_seg",
27 | ignore_label=255,
28 | **meta,
29 | )
30 |
31 | _root = os.getenv("DETECTRON2_DATASETS", "datasets")
32 |
33 | register_all_gta_sem_seg(_root)
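34 |
35 | # Illustrative usage (not part of the original file): once this module is imported, the split
36 | # is available through the standard detectron2 catalogs, e.g.
37 | #   from detectron2.data import DatasetCatalog, MetadataCatalog
38 | #   dicts = DatasetCatalog.get("gta_trainid_val")   # [{"file_name": ..., "sem_seg_file_name": ...}, ...]
39 | #   meta = MetadataCatalog.get("gta_trainid_val")   # evaluator_type="sem_seg", ignore_label=255, ...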
--------------------------------------------------------------------------------
/hgformer/data/datasets/register_synthia.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import os
3 |
4 | from detectron2.data import DatasetCatalog, MetadataCatalog
5 | from detectron2.data.datasets import load_sem_seg
6 | from detectron2.data.datasets.builtin_meta import _get_builtin_metadata
7 | # from .acdc import load_acdc_semantic
8 | # from detectron2.data.datasets import load_sem_seg
9 |
10 |
11 | _RAW_BDD_SPLITS = {
12 | "synthia_train": ("synthia/RGB/train", "synthia/labels_detectron2/train"),
13 | "synthia_val": ("synthia/RGB/val", "synthia/labels_detectron2/val")
14 | }
15 |
16 | def register_all_synthia(root):
17 | for key, (image_dir, gt_dir) in _RAW_BDD_SPLITS.items():
18 | meta = _get_builtin_metadata("cityscapes")
19 | image_dir = os.path.join(root, image_dir)
20 | gt_dir = os.path.join(root, gt_dir)
21 |
22 | DatasetCatalog.register(
23 | key, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="png")
24 | )
25 | MetadataCatalog.get(key).set(
26 | image_dir=image_dir,
27 | gt_dir=gt_dir,
28 | evaluator_type="sem_seg",
29 | ignore_label=255,
30 | **meta,
31 | )
32 |
33 | _root = os.getenv("DETECTRON2_DATASETS", "datasets")
34 | register_all_synthia(_root)
--------------------------------------------------------------------------------
/hgformer/data/datasets/register_bdd.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import os
3 |
4 | from detectron2.data import DatasetCatalog, MetadataCatalog
5 | from detectron2.data.datasets import load_sem_seg
6 | from detectron2.data.datasets.builtin_meta import _get_builtin_metadata
7 |
8 |
9 | _RAW_BDD_SPLITS = {
10 | "bdd_train": ("bdd/images/10k/train", "bdd/labels/sem_seg/masks/train"),
11 | "bdd_val": ("bdd/images/10k/val", "bdd/labels/sem_seg/masks/val")
12 | }
13 |
14 | def register_all_bdd(root):
15 | for key, (image_dir, gt_dir) in _RAW_BDD_SPLITS.items():
16 | meta = _get_builtin_metadata("cityscapes")
17 | image_dir = os.path.join(root, image_dir)
18 | gt_dir = os.path.join(root, gt_dir)
19 |
20 | # DatasetCatalog.register(
21 | # key, lambda x=image_dir, y=gt_dir: load_sem_seg(x, y)
22 | # )
23 | DatasetCatalog.register(
24 | key, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="jpg")
25 | )
26 | MetadataCatalog.get(key).set(
27 | image_dir=image_dir,
28 | gt_dir=gt_dir,
29 | evaluator_type="sem_seg",
30 | ignore_label=255,
31 | **meta,
32 | )
33 |
34 | _root = os.getenv("DETECTRON2_DATASETS", "datasets")
35 | register_all_bdd(_root)
--------------------------------------------------------------------------------
/hgformer/data/datasets/register_mapillary_19.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import os
3 |
4 | from detectron2.data import DatasetCatalog, MetadataCatalog
5 | from detectron2.data.datasets import load_sem_seg
6 | from detectron2.data.datasets.builtin_meta import _get_builtin_metadata
7 | # from detectron2.data.datasets import load_sem_seg
8 |
9 |
10 | _RAW_BDD_SPLITS = {
11 | "mapillary_train": ("mapillary/training/images", "mapillary/labels_detectron2/training"),
12 | "mapillary_val": ("mapillary/validation/images", "mapillary/labels_detectron2/validation")
13 | }
14 |
15 | def register_all_mapillary_19(root):
16 | for key, (image_dir, gt_dir) in _RAW_BDD_SPLITS.items():
17 | meta = _get_builtin_metadata("cityscapes")
18 | image_dir = os.path.join(root, image_dir)
19 | gt_dir = os.path.join(root, gt_dir)
20 |
21 | DatasetCatalog.register(
22 | key, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="jpg")
23 | )
24 | MetadataCatalog.get(key).set(
25 | image_dir=image_dir,
26 | gt_dir=gt_dir,
27 | evaluator_type="sem_seg",
28 | ignore_label=255,
29 | **meta,
30 | )
31 |
32 | _root = os.getenv("DETECTRON2_DATASETS", "datasets")
33 | register_all_mapillary_19(_root)
--------------------------------------------------------------------------------
/configs/city_c/maskformer2_swin_large_IN21k_384_bs16_90k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../cityscapes/maskformer2_R50_bs16_20k_gn.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "D2SwinTransformer"
5 | SWIN:
6 | EMBED_DIM: 192
7 | DEPTHS: [2, 2, 18, 2]
8 | NUM_HEADS: [6, 12, 24, 48]
9 | WINDOW_SIZE: 12
10 | APE: False
11 | DROP_PATH_RATE: 0.3
12 | PATCH_NORM: True
13 | PRETRAIN_IMG_SIZE: 384
14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
15 | PIXEL_MEAN: [123.675, 116.280, 103.530]
16 | PIXEL_STD: [58.395, 57.120, 57.375]
17 | MASK_FORMER:
18 | NUM_OBJECT_QUERIES: 100
19 | DATASETS:
20 | TRAIN: ("cityscapes_fine_sem_seg_train", )
21 | TEST: ("cityscapes_fine_sem_seg_val", "cityscapes_fine_gaussian_noise_5_val", "cityscapes_fine_shot_noise_5_val", "cityscapes_fine_impulse_noise_5_val", "cityscapes_fine_defocus_blur_5_val", "cityscapes_fine_glass_blur_5_val", "cityscapes_fine_motion_blur_5_val", "cityscapes_fine_zoom_blur_5_val", "cityscapes_fine_snow_5_val", "cityscapes_fine_frost_5_val", "cityscapes_fine_fog_5_val", "cityscapes_fine_brightness_5_val", "cityscapes_fine_contrast_5_val", "cityscapes_fine_elastic_transform_5_val", "cityscapes_fine_pixelate_5_val", "cityscapes_fine_jpeg_compression_5_val", "cityscapes_fine_speckle_noise_5_val", "cityscapes_fine_gaussian_blur_5_val", "cityscapes_fine_spatter_5_val", "cityscapes_fine_saturate_5_val")
22 |
--------------------------------------------------------------------------------
/configs/city_c/hgformer_swin_tiny_bs16_20k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../cityscapes/hgformer_R50_bs16_20k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "D2SwinTransformer"
5 | SWIN:
6 | EMBED_DIM: 96
7 | DEPTHS: [2, 2, 6, 2]
8 | NUM_HEADS: [3, 6, 12, 24]
9 | WINDOW_SIZE: 7
10 | APE: False
11 | DROP_PATH_RATE: 0.3
12 | PATCH_NORM: True
13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl"
14 | PIXEL_MEAN: [123.675, 116.280, 103.530]
15 | PIXEL_STD: [58.395, 57.120, 57.375]
16 |
17 | SOLVER:
18 | MAX_ITER: 20000
19 | # IMS_PER_BATCH: 2
20 | DATASETS:
21 | TRAIN: ("cityscapes_fine_sem_seg_train", )
22 | TEST: ("cityscapes_fine_sem_seg_val", "cityscapes_fine_gaussian_noise_5_val", "cityscapes_fine_shot_noise_5_val", "cityscapes_fine_impulse_noise_5_val", "cityscapes_fine_defocus_blur_5_val", "cityscapes_fine_glass_blur_5_val", "cityscapes_fine_motion_blur_5_val", "cityscapes_fine_zoom_blur_5_val", "cityscapes_fine_snow_5_val", "cityscapes_fine_frost_5_val", "cityscapes_fine_fog_5_val", "cityscapes_fine_brightness_5_val", "cityscapes_fine_contrast_5_val", "cityscapes_fine_elastic_transform_5_val", "cityscapes_fine_pixelate_5_val", "cityscapes_fine_jpeg_compression_5_val", "cityscapes_fine_speckle_noise_5_val", "cityscapes_fine_gaussian_blur_5_val", "cityscapes_fine_spatter_5_val", "cityscapes_fine_saturate_5_val")
23 | TEST:
24 | CLUSTER_SOFTMAX: True
25 | PRED_STAGE: "spix_pixelexclude0125+stage3"
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # HGFormer: Hierarchical Grouping Transformer for Domain Generalized Semantic Segmentation
2 |
3 | This is the official code for the [HGFormer](https://openaccess.thecvf.com/content/CVPR2023/papers/Ding_HGFormer_Hierarchical_Grouping_Transformer_for_Domain_Generalized_Semantic_Segmentation_CVPR_2023_paper.pdf) paper (CVPR 2023).
4 |
5 | ## Installation
6 |
7 | See [installation instructions](INSTALL.md).
8 |
9 | ## Getting Started
10 |
11 | See [Preparing Datasets for HGFormer](datasets/README.md).
12 |
13 | See [Getting Started with HGFormer](GETTING_STARTED.md).
14 |
15 | ## Pre-trained Models and Baselines
16 |
17 | We provide a large set of baseline results and trained models available for download in the [HGFormer Model Zoo](MODEL_ZOO.md).
18 |
19 | ## Citing HGFormer
20 |
21 | If you use HGFormer in your research, please use the following BibTeX entry.
22 |
23 | ```BibTeX
24 | @inproceedings{ding2023hgformer,
25 | title={HGFormer: Hierarchical Grouping Transformer for Domain Generalized Semantic Segmentation},
26 | author={Ding, Jian and Xue, Nan and Xia, Gui-Song and Schiele, Bernt and Dai, Dengxin},
27 | booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
28 | pages={15413--15423},
29 | year={2023}
30 | }
31 | ```
32 |
33 | ## Acknowledgement
34 |
35 | Code is largely based on Mask2Former (https://github.com/facebookresearch/Mask2Former).
36 |
--------------------------------------------------------------------------------
/configs/city_c/maskformer2_swin_tiny_bs16_20k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../cityscapes/maskformer2_R50_bs16_20k_gn.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "D2SwinTransformer"
5 | SWIN:
6 | EMBED_DIM: 96
7 | DEPTHS: [2, 2, 6, 2]
8 | NUM_HEADS: [3, 6, 12, 24]
9 | WINDOW_SIZE: 7
10 | APE: False
11 | DROP_PATH_RATE: 0.3
12 | PATCH_NORM: True
13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl"
14 | PIXEL_MEAN: [123.675, 116.280, 103.530]
15 | PIXEL_STD: [58.395, 57.120, 57.375]
16 |
17 | DATALOADER:
18 | FILTER_EMPTY_ANNOTATIONS: True
19 | NUM_WORKERS: 4
20 | VERSION: 2
21 | SOLVER:
22 | MAX_ITER: 20000
23 |
24 | CUDNN_BENCHMARK: True
25 | DATASETS:
26 | TRAIN: ("cityscapes_fine_sem_seg_train", )
27 | TEST: ("cityscapes_fine_sem_seg_val", "cityscapes_fine_gaussian_noise_5_val", "cityscapes_fine_shot_noise_5_val", "cityscapes_fine_impulse_noise_5_val", "cityscapes_fine_defocus_blur_5_val", "cityscapes_fine_glass_blur_5_val", "cityscapes_fine_motion_blur_5_val", "cityscapes_fine_zoom_blur_5_val", "cityscapes_fine_snow_5_val", "cityscapes_fine_frost_5_val", "cityscapes_fine_fog_5_val", "cityscapes_fine_brightness_5_val", "cityscapes_fine_contrast_5_val", "cityscapes_fine_elastic_transform_5_val", "cityscapes_fine_pixelate_5_val", "cityscapes_fine_jpeg_compression_5_val", "cityscapes_fine_speckle_noise_5_val", "cityscapes_fine_gaussian_blur_5_val", "cityscapes_fine_spatter_5_val", "cityscapes_fine_saturate_5_val")
28 |
--------------------------------------------------------------------------------
/hgformer/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #pragma once
17 | #include <torch/extension.h>
18 |
19 | at::Tensor ms_deform_attn_cuda_forward(
20 | const at::Tensor &value,
21 | const at::Tensor &spatial_shapes,
22 | const at::Tensor &level_start_index,
23 | const at::Tensor &sampling_loc,
24 | const at::Tensor &attn_weight,
25 | const int im2col_step);
26 |
27 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
28 | const at::Tensor &value,
29 | const at::Tensor &spatial_shapes,
30 | const at::Tensor &level_start_index,
31 | const at::Tensor &sampling_loc,
32 | const at::Tensor &attn_weight,
33 | const at::Tensor &grad_output,
34 | const int im2col_step);
35 |
36 |
--------------------------------------------------------------------------------
/hgformer/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #pragma once
17 | #include <torch/extension.h>
18 |
19 | at::Tensor
20 | ms_deform_attn_cpu_forward(
21 | const at::Tensor &value,
22 | const at::Tensor &spatial_shapes,
23 | const at::Tensor &level_start_index,
24 | const at::Tensor &sampling_loc,
25 | const at::Tensor &attn_weight,
26 | const int im2col_step);
27 |
28 | std::vector<at::Tensor>
29 | ms_deform_attn_cpu_backward(
30 | const at::Tensor &value,
31 | const at::Tensor &spatial_shapes,
32 | const at::Tensor &level_start_index,
33 | const at::Tensor &sampling_loc,
34 | const at::Tensor &attn_weight,
35 | const at::Tensor &grad_output,
36 | const int im2col_step);
37 |
38 |
39 |
--------------------------------------------------------------------------------
/configs/cityscapes/maskformer2_R50_bs16_20k_gn.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-Cityscapes-SemanticSegmentation.yaml
2 | MODEL:
3 | META_ARCHITECTURE: "MaskFormer"
4 | RESNETS:
5 | NORM: "GN"
6 | SEM_SEG_HEAD:
7 | NAME: "MaskFormerHead"
8 | IGNORE_VALUE: 255
9 | NUM_CLASSES: 19
10 | LOSS_WEIGHT: 1.0
11 | CONVS_DIM: 256
12 | MASK_DIM: 256
13 | NORM: "GN"
14 | # pixel decoder
15 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
16 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
17 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
18 | COMMON_STRIDE: 4
19 | TRANSFORMER_ENC_LAYERS: 6
20 | MASK_FORMER:
21 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
22 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
23 | DEEP_SUPERVISION: True
24 | NO_OBJECT_WEIGHT: 0.1
25 | CLASS_WEIGHT: 2.0
26 | MASK_WEIGHT: 5.0
27 | DICE_WEIGHT: 5.0
28 | HIDDEN_DIM: 256
29 | NUM_OBJECT_QUERIES: 100
30 | NHEADS: 8
31 | DROPOUT: 0.0
32 | DIM_FEEDFORWARD: 2048
33 | ENC_LAYERS: 0
34 | PRE_NORM: False
35 | ENFORCE_INPUT_PROJ: False
36 | SIZE_DIVISIBILITY: 32
37 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
38 | TRAIN_NUM_POINTS: 12544
39 | OVERSAMPLE_RATIO: 3.0
40 | IMPORTANCE_SAMPLE_RATIO: 0.75
41 | TEST:
42 | SEMANTIC_ON: True
43 | INSTANCE_ON: False
44 | PANOPTIC_ON: False
45 | OVERLAP_THRESHOLD: 0.8
46 | OBJECT_MASK_THRESHOLD: 0.8
47 | SOLVER:
48 | IMS_PER_BATCH: 16
49 | BASE_LR: 0.0001
50 | MAX_ITER: 20000
--------------------------------------------------------------------------------
/configs/mapillary/maskformer2_R50_bs16_90k_gn_mapillary_20k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-mapillary19-SemanticSegmentation.yaml
2 | MODEL:
3 | META_ARCHITECTURE: "MaskFormer"
4 | RESNETS:
5 | NORM: "GN"
6 | SEM_SEG_HEAD:
7 | NAME: "MaskFormerHead"
8 | IGNORE_VALUE: 255
9 | NUM_CLASSES: 19
10 | LOSS_WEIGHT: 1.0
11 | CONVS_DIM: 256
12 | MASK_DIM: 256
13 | NORM: "GN"
14 | # pixel decoder
15 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
16 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
17 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
18 | COMMON_STRIDE: 4
19 | TRANSFORMER_ENC_LAYERS: 6
20 | MASK_FORMER:
21 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
22 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
23 | DEEP_SUPERVISION: True
24 | NO_OBJECT_WEIGHT: 0.1
25 | CLASS_WEIGHT: 2.0
26 | MASK_WEIGHT: 5.0
27 | DICE_WEIGHT: 5.0
28 | HIDDEN_DIM: 256
29 | NUM_OBJECT_QUERIES: 100
30 | NHEADS: 8
31 | DROPOUT: 0.0
32 | DIM_FEEDFORWARD: 2048
33 | ENC_LAYERS: 0
34 | PRE_NORM: False
35 | ENFORCE_INPUT_PROJ: False
36 | SIZE_DIVISIBILITY: 32
37 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
38 | TRAIN_NUM_POINTS: 12544
39 | OVERSAMPLE_RATIO: 3.0
40 | IMPORTANCE_SAMPLE_RATIO: 0.75
41 | TEST:
42 | SEMANTIC_ON: True
43 | INSTANCE_ON: False
44 | PANOPTIC_ON: False
45 | OVERLAP_THRESHOLD: 0.8
46 | OBJECT_MASK_THRESHOLD: 0.8
47 | SOLVER:
48 | IMS_PER_BATCH: 16
49 | BASE_LR: 0.0001
50 | MAX_ITER: 20000
--------------------------------------------------------------------------------
/hgformer/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #include <vector>
17 |
18 | #include <ATen/ATen.h>
19 | #include <ATen/cuda/CUDAContext.h>
20 |
21 |
22 | at::Tensor
23 | ms_deform_attn_cpu_forward(
24 | const at::Tensor &value,
25 | const at::Tensor &spatial_shapes,
26 | const at::Tensor &level_start_index,
27 | const at::Tensor &sampling_loc,
28 | const at::Tensor &attn_weight,
29 | const int im2col_step)
30 | {
31 | AT_ERROR("Not implemented on the CPU");
32 | }
33 |
34 | std::vector<at::Tensor>
35 | ms_deform_attn_cpu_backward(
36 | const at::Tensor &value,
37 | const at::Tensor &spatial_shapes,
38 | const at::Tensor &level_start_index,
39 | const at::Tensor &sampling_loc,
40 | const at::Tensor &attn_weight,
41 | const at::Tensor &grad_output,
42 | const int im2col_step)
43 | {
44 | AT_ERROR("Not implemented on the CPU");
45 | }
46 |
47 |
--------------------------------------------------------------------------------
/tools/convert-torchvision-to-d2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Copyright (c) Facebook, Inc. and its affiliates.
3 |
4 | import pickle as pkl
5 | import sys
6 |
7 | import torch
8 |
9 | """
10 | Usage:
11 | # download one of the ResNet{18,34,50,101,152} models from torchvision:
12 | wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth
13 | # run the conversion
14 | ./convert-torchvision-to-d2.py r50.pth r50.pkl
15 | # Then, use r50.pkl with the following changes in config:
16 | MODEL:
17 | WEIGHTS: "/path/to/r50.pkl"
18 | PIXEL_MEAN: [123.675, 116.280, 103.530]
19 | PIXEL_STD: [58.395, 57.120, 57.375]
20 | RESNETS:
21 | DEPTH: 50
22 | STRIDE_IN_1X1: False
23 | INPUT:
24 | FORMAT: "RGB"
25 | """
26 |
27 | if __name__ == "__main__":
28 | input = sys.argv[1]
29 |
30 | obj = torch.load(input, map_location="cpu")
31 |
32 | newmodel = {}
33 | for k in list(obj.keys()):
34 | old_k = k
35 | if "layer" not in k:
36 | k = "stem." + k
37 | for t in [1, 2, 3, 4]:
38 | k = k.replace("layer{}".format(t), "res{}".format(t + 1))
39 | for t in [1, 2, 3]:
40 | k = k.replace("bn{}".format(t), "conv{}.norm".format(t))
41 | k = k.replace("downsample.0", "shortcut")
42 | k = k.replace("downsample.1", "shortcut.norm")
43 | print(old_k, "->", k)
44 | newmodel[k] = obj.pop(old_k).detach().numpy()
45 |
46 | res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True}
47 |
48 | with open(sys.argv[2], "wb") as f:
49 | pkl.dump(res, f)
50 | if obj:
51 | print("Unconverted keys:", obj.keys())
52 |
--------------------------------------------------------------------------------
/datasets/split_data/gta/resize_img.py:
--------------------------------------------------------------------------------
1 | import os
2 | from PIL import Image
3 | import numpy as np
4 | import cv2
5 |
6 | def GetFileFromThisRootDir(dir,ext = None):
7 | allfiles = []
8 | needExtFilter = (ext != None)
9 | for root,dirs,files in os.walk(dir):
10 | for filespath in files:
11 | filepath = os.path.join(root, filespath)
12 | extension = os.path.splitext(filepath)[1][1:]
13 | if needExtFilter and extension in ext:
14 | allfiles.append(filepath)
15 | elif not needExtFilter:
16 | allfiles.append(filepath)
17 | return allfiles
18 |
19 | def resize_split(split):
20 | filenames = GetFileFromThisRootDir(f'datasets/GTA/images/{split}')
21 | for filename in filenames:
22 | basename = os.path.basename(filename)
23 | img = Image.open(filename)
24 | gtname = os.path.join(f'datasets/GTA/labels/{split}', basename)
25 | gt = Image.open(gtname)
26 | print(f'filename: {filename}')
27 | if not os.path.exists(f'datasets/GTA/labels/{split}_resize'):
28 | os.makedirs(f'datasets/GTA/labels/{split}_resize')
29 | if (img.width != gt.width) or (img.height != gt.height):
30 | # read the ground-truth label map as an array
31 | gt_np = np.asarray(gt)
32 | # resize the label map to the image resolution; nearest-neighbour interpolation keeps the label ids intact
33 | width, height = img.width, img.height
34 | # note: cv2.resize takes the target size as (width, height)
35 | resized_gt_np = cv2.resize(gt_np, (width, height), interpolation=cv2.INTER_NEAREST)
36 |
37 | # save the resized label map
38 | outname = os.path.join(f'datasets/GTA/labels/{split}_resize', basename)
39 | cv2.imwrite(outname, resized_gt_np)
40 |
41 | if __name__ == '__main__':
42 | resize_split('valid')
43 | # resize_split('train')
44 | # resize_split('test')
--------------------------------------------------------------------------------
/tools/evaluate_coco_boundary_ap.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
3 | # Modified by Bowen Cheng from: https://github.com/bowenc0221/boundary-iou-api/blob/master/tools/coco_instance_evaluation.py
4 |
5 | """
6 | Evaluation for COCO val2017:
7 | python ./tools/evaluate_coco_boundary_ap.py \
8 | --gt-json-file COCO_GT_JSON \
9 | --dt-json-file COCO_DT_JSON
10 | """
11 | import argparse
12 | import json
13 |
14 | from boundary_iou.coco_instance_api.coco import COCO
15 | from boundary_iou.coco_instance_api.cocoeval import COCOeval
16 |
17 |
18 | def main():
19 | parser = argparse.ArgumentParser()
20 | parser.add_argument("--gt-json-file", default="")
21 | parser.add_argument("--dt-json-file", default="")
22 | parser.add_argument("--iou-type", default="boundary")
23 | parser.add_argument("--dilation-ratio", default="0.020", type=float)
24 | args = parser.parse_args()
25 | print(args)
26 |
27 | annFile = args.gt_json_file
28 | resFile = args.dt_json_file
29 | dilation_ratio = args.dilation_ratio
30 | if args.iou_type == "boundary":
31 | get_boundary = True
32 | else:
33 | get_boundary = False
34 | cocoGt = COCO(annFile, get_boundary=get_boundary, dilation_ratio=dilation_ratio)
35 |
36 | # remove box predictions
37 | resFile = json.load(open(resFile))
38 | for c in resFile:
39 | c.pop("bbox", None)
40 |
41 | cocoDt = cocoGt.loadRes(resFile)
42 | cocoEval = COCOeval(cocoGt, cocoDt, iouType=args.iou_type, dilation_ratio=dilation_ratio)
43 | cocoEval.evaluate()
44 | cocoEval.accumulate()
45 | cocoEval.summarize()
46 |
47 |
48 | if __name__ == '__main__':
49 | main()
50 |
--------------------------------------------------------------------------------
/INSTALL.md:
--------------------------------------------------------------------------------
1 | ## Installation
2 |
3 | ### Requirements
4 | - Linux or macOS with Python ≥ 3.6
5 | - PyTorch ≥ 1.9 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation.
6 | Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note: please check that
7 | your PyTorch version matches the one required by Detectron2.
8 | - Detectron2: follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html).
9 | - OpenCV is optional, but it is needed by the demo and for visualization
10 | - `pip install -r requirements.txt`
11 |
12 | ### CUDA kernel for MSDeformAttn
13 | After preparing the required environment, run the following command to compile the CUDA kernel for MSDeformAttn:
14 |
15 | `CUDA_HOME` must be defined and point to the directory of the installed CUDA toolkit.
16 |
17 | ```bash
18 | cd hgformer/modeling/pixel_decoder/ops
19 | python setup.py build install
20 | ```
21 |
22 | #### Building on another system
23 | To build on a system that does not have a GPU device but provides the drivers:
24 | ```bash
25 | TORCH_CUDA_ARCH_LIST='8.0' FORCE_CUDA=1 python setup.py build install
26 | ```
27 |
28 | ### Example conda environment setup
29 | ```bash
30 | conda create --name hgformer python=3.8 -y
31 | conda activate hgformer
32 | conda install pytorch==1.9.0 torchvision==0.10.0 cudatoolkit=11.1 -c pytorch -c nvidia
33 | pip install -U opencv-python
34 |
35 | # under your working directory
36 | python -m pip install detectron2 -f \
37 | https://dl.fbaipublicfiles.com/detectron2/wheels/cu111/torch1.9/index.html
38 |
39 | pip install git+https://github.com/mcordts/cityscapesScripts.git
40 |
41 | cd ..
42 | git clone https://github.com/dingjiansw101/HGFormer.git
43 | cd HGFormer
44 | pip install -r requirements.txt
45 | cd hgformer/modeling/pixel_decoder/ops
46 | sh make.sh
47 | ```
48 |
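49 | ### Verifying the MSDeformAttn build (optional)
50 |
51 | A minimal sanity-check sketch for the compiled extension, assuming the repository root is on
52 | `PYTHONPATH`, a CUDA device is available, and the `ops` package matches the upstream
53 | Deformable-DETR implementation it is derived from (this snippet is illustrative and not part of
54 | the repository):
55 |
56 | ```python
57 | import torch
58 | from hgformer.modeling.pixel_decoder.ops.modules import MSDeformAttn
59 |
60 | # one feature level of 16x16 tokens, batch size 2, hidden dim 256
61 | bs, h, w, d = 2, 16, 16, 256
62 | spatial_shapes = torch.as_tensor([[h, w]], dtype=torch.long, device="cuda")
63 | level_start_index = torch.as_tensor([0], dtype=torch.long, device="cuda")
64 | value = torch.rand(bs, h * w, d, device="cuda")
65 | query = torch.rand(bs, h * w, d, device="cuda")
66 | # normalized (x, y) reference points: one per query and per level
67 | reference_points = torch.rand(bs, h * w, 1, 2, device="cuda")
68 |
69 | attn = MSDeformAttn(d_model=d, n_levels=1, n_heads=8, n_points=4).cuda()
70 | out = attn(query, reference_points, value, spatial_shapes, level_start_index)
71 | print(out.shape)  # expected: torch.Size([2, 256, 256])
72 | ```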
--------------------------------------------------------------------------------
/configs/mapillary/Base-mapillary19-SemanticSegmentation.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 | BACKBONE:
3 | FREEZE_AT: 0
4 | NAME: "build_resnet_backbone"
5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
6 | PIXEL_MEAN: [123.675, 116.280, 103.530]
7 | PIXEL_STD: [58.395, 57.120, 57.375]
8 | RESNETS:
9 | DEPTH: 50
10 | STEM_TYPE: "basic" # not used
11 | STEM_OUT_CHANNELS: 64
12 | STRIDE_IN_1X1: False
13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
14 | NORM: "SyncBN" # use syncbn for cityscapes dataset
15 | RES5_MULTI_GRID: [1, 1, 1] # not used
16 | DATASETS:
17 | TRAIN: ("mapillary_train",)
18 | TEST: ("mapillary_val", "gta_trainid_val", "synthia_val", "cityscapes_fine_sem_seg_val", "bdd_val")
19 |
20 | SOLVER:
21 | IMS_PER_BATCH: 16
22 | BASE_LR: 0.0001
23 | MAX_ITER: 90000
24 | WARMUP_FACTOR: 1.0
25 | WARMUP_ITERS: 0
26 | WEIGHT_DECAY: 0.05
27 | OPTIMIZER: "ADAMW"
28 | LR_SCHEDULER_NAME: "WarmupPolyLR"
29 | BACKBONE_MULTIPLIER: 0.1
30 | CLIP_GRADIENTS:
31 | ENABLED: True
32 | CLIP_TYPE: "full_model"
33 | CLIP_VALUE: 0.01
34 | NORM_TYPE: 2.0
35 | AMP:
36 | ENABLED: True
37 | INPUT:
38 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"]
39 | MIN_SIZE_TRAIN_SAMPLING: "choice"
40 | MIN_SIZE_TEST: 1024
41 | MAX_SIZE_TRAIN: 4096
42 | MAX_SIZE_TEST: 2048
43 | CROP:
44 | ENABLED: True
45 | TYPE: "absolute"
46 | SIZE: (512, 1024)
47 | SINGLE_CATEGORY_MAX_AREA: 1.0
48 | COLOR_AUG_SSD: True
49 | SIZE_DIVISIBILITY: -1
50 | FORMAT: "RGB"
51 | DATASET_MAPPER_NAME: "mask_former_semantic"
52 | TEST:
53 | EVAL_PERIOD: 90000
54 | AUG:
55 | ENABLED: False
56 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792]
57 | MAX_SIZE: 4096
58 | FLIP: True
59 | DATALOADER:
60 | FILTER_EMPTY_ANNOTATIONS: True
61 | NUM_WORKERS: 4
62 | VERSION: 2
63 |
--------------------------------------------------------------------------------
/datasets/prepare_gta_sem_seg.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | import os
4 | from pathlib import Path
5 |
6 | import numpy as np
7 | import tqdm
8 | from PIL import Image
9 | from multiprocessing import Pool
10 |
11 | id_to_trainid = {7: 0, 8: 1, 11: 2, 12: 3, 13: 4, 17: 5,
12 | 19: 6, 20: 7, 21: 8, 22: 9, 23: 10, 24: 11, 25: 12,
13 | 26: 13, 27: 14, 28: 15, 31: 16, 32: 17, 33: 18}
14 |
15 |
16 | def convert(input, outputpath):
17 | lab = np.asarray(Image.open(input))
18 | assert lab.dtype == np.uint8
19 | output = np.zeros_like(lab, dtype=np.uint8) + 255
20 | for obj_id in np.unique(lab):
21 | if obj_id in id_to_trainid:
22 | output[lab == obj_id] = id_to_trainid[obj_id]
23 |
24 | Image.fromarray(output).save(outputpath)
25 |
26 | def worker(file_tuple):
27 | file, output_file = file_tuple
28 | lab = np.asarray(Image.open(file))
29 | assert lab.dtype == np.uint8
30 | output = np.zeros_like(lab, dtype=np.uint8) + 255
31 | for obj_id in np.unique(lab):
32 | if obj_id in id_to_trainid:
33 | output[lab == obj_id] = id_to_trainid[obj_id]
34 |
35 | Image.fromarray(output).save(output_file)
36 |
37 | if __name__ == "__main__":
38 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "GTA"
39 | for name in ["train", "valid", "test"]:
40 | annotation_dir = dataset_dir / "labels" / name
41 | output_dir = dataset_dir / "labels_detectron2" / name
42 | output_dir.mkdir(parents=True, exist_ok=True)
43 |
44 | file_list = []
45 | for file in tqdm.tqdm(list(annotation_dir.iterdir())):
46 | output_file = output_dir / file.name
47 | file_list.append((file, output_file))
48 | # convert(file, output_file)
49 |
50 | pool = Pool(32)
51 | pool.map(worker, file_list)
52 | print(f'done {name}')
53 |
--------------------------------------------------------------------------------
/configs/cityscapes/Base-Cityscapes-SemanticSegmentation.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 | BACKBONE:
3 | FREEZE_AT: 0
4 | NAME: "build_resnet_backbone"
5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
6 | PIXEL_MEAN: [123.675, 116.280, 103.530]
7 | PIXEL_STD: [58.395, 57.120, 57.375]
8 | RESNETS:
9 | DEPTH: 50
10 | STEM_TYPE: "basic" # not used
11 | STEM_OUT_CHANNELS: 64
12 | STRIDE_IN_1X1: False
13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
14 | NORM: "SyncBN" # use syncbn for cityscapes dataset
15 | RES5_MULTI_GRID: [1, 1, 1] # not used
16 |
17 | DATASETS:
18 | TRAIN: ("cityscapes_fine_sem_seg_train",)
19 | TEST: ("cityscapes_fine_sem_seg_val", "mapillary_val", "bdd_val", "gta_trainid_val", "synthia_val")
20 | SOLVER:
21 | IMS_PER_BATCH: 16
22 | BASE_LR: 0.0001
23 | MAX_ITER: 90000
24 | WARMUP_FACTOR: 1.0
25 | WARMUP_ITERS: 0
26 | WEIGHT_DECAY: 0.05
27 | OPTIMIZER: "ADAMW"
28 | LR_SCHEDULER_NAME: "WarmupPolyLR"
29 | BACKBONE_MULTIPLIER: 0.1
30 | # dense period for job array on gpu22
31 | CHECKPOINT_PERIOD: 1000
32 | CLIP_GRADIENTS:
33 | ENABLED: True
34 | CLIP_TYPE: "full_model"
35 | CLIP_VALUE: 0.01
36 | NORM_TYPE: 2.0
37 | AMP:
38 | ENABLED: True
39 | INPUT:
40 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"]
41 | MIN_SIZE_TRAIN_SAMPLING: "choice"
42 | MIN_SIZE_TEST: 1024
43 | MAX_SIZE_TRAIN: 4096
44 | MAX_SIZE_TEST: 2048
45 | CROP:
46 | ENABLED: True
47 | TYPE: "absolute"
48 | SIZE: (512, 1024)
49 | SINGLE_CATEGORY_MAX_AREA: 1.0
50 | COLOR_AUG_SSD: True
51 | SIZE_DIVISIBILITY: -1
52 | FORMAT: "RGB"
53 | DATASET_MAPPER_NAME: "mask_former_semantic"
54 | TEST:
55 | EVAL_PERIOD: 5000
56 | AUG:
57 | ENABLED: False
58 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792]
59 | MAX_SIZE: 4096
60 | FLIP: True
61 | DATALOADER:
62 | FILTER_EMPTY_ANNOTATIONS: True
63 | NUM_WORKERS: 4
64 | VERSION: 2
65 |
66 | #CUDNN_BENCHMARK: True
--------------------------------------------------------------------------------
/hgformer/data/datasets/register_city_c.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import os
3 |
4 | from detectron2.data import DatasetCatalog, MetadataCatalog
5 | from detectron2.data.datasets import load_sem_seg
6 | from detectron2.data.datasets.builtin_meta import _get_builtin_metadata
7 | # from .acdc import load_acdc_semantic
8 | from detectron2.data.datasets.cityscapes import load_cityscapes_semantic
9 |
10 | corruptions = ['clean', 'gaussian_noise', 'shot_noise', 'impulse_noise', 'defocus_blur',
11 | 'glass_blur', 'motion_blur', 'zoom_blur', 'snow', 'frost', 'fog',
12 | 'brightness', 'contrast', 'elastic_transform', 'pixelate', 'jpeg_compression',
13 | 'speckle_noise', 'gaussian_blur', 'spatter', 'saturate']
14 | # ==== Predefined splits for raw cityscapes c images ===========
15 |
16 | _RAW_ACDC_SPLITS = {}
17 | for noise in corruptions:
18 | if noise == 'clean':
19 | cur_data = {f"cityscapes_fine_{noise}_val": (f"cityscapes-c/{noise}/", "cityscapes/gtFine/val/")}
20 | else:
21 | for severity in range(5):
22 | severity_str = str(severity+1)
23 | cur_data = {f"cityscapes_fine_{noise}_{severity_str}_val": (f"cityscapes-c/{noise}/{severity_str}", "cityscapes/gtFine/val/")}
24 | _RAW_ACDC_SPLITS.update(cur_data)
25 | def register_all_city_c(root):
26 | for key, (image_dir, gt_dir) in _RAW_ACDC_SPLITS.items():
27 | meta = _get_builtin_metadata("cityscapes")
28 | image_dir = os.path.join(root, image_dir)
29 | gt_dir = os.path.join(root, gt_dir)
30 | # sem_key = key.format(task="sem_seg")
31 | DatasetCatalog.register(
32 | key, lambda x=image_dir, y=gt_dir: load_cityscapes_semantic(x, y)
33 | )
34 | MetadataCatalog.get(key).set(
35 | image_dir=image_dir,
36 | gt_dir=gt_dir,
37 | evaluator_type="cityscapes_sem_seg",
38 | ignore_label=255,
39 | **meta,
40 | )
41 |
42 | _root = os.getenv("DETECTRON2_DATASETS", "datasets")
43 | register_all_city_c(_root)
44 |
--------------------------------------------------------------------------------
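A minimal sketch (not part of the repo) of how the splits registered above can be looked up through detectron2's catalogs, assuming the `hgformer` package is importable so that the registration code runs:

```
from detectron2.data import DatasetCatalog, MetadataCatalog

# Importing the module executes the registration at the bottom of the file above.
from hgformer.data.datasets import register_city_c  # noqa: F401

name = "cityscapes_fine_gaussian_noise_5_val"   # <corruption>_<severity> naming used above
meta = MetadataCatalog.get(name)
print(meta.image_dir, meta.gt_dir, meta.evaluator_type)

# Building the dicts requires the Cityscapes-C images to exist under $DETECTRON2_DATASETS:
# dataset_dicts = DatasetCatalog.get(name)
```
--------------------------------------------------------------------------------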
/hgformer/data/samplers/grouped_batch_sampler.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import numpy as np
3 | from torch.utils.data.sampler import BatchSampler, Sampler
4 |
5 |
6 | class GroupedBatchSampler(BatchSampler):
7 | """
8 | Wraps another sampler to yield a mini-batch of indices.
9 |     It enforces that each batch only contains elements from the same group.
10 |     It also tries to yield mini-batches in an ordering that is as close as
11 |     possible to the ordering of the original sampler.
12 | """
13 |
14 | def __init__(self, sampler, group_ids, batch_size):
15 | """
16 | Args:
17 | sampler (Sampler): Base sampler.
18 | group_ids (list[int]): If the sampler produces indices in range [0, N),
19 | `group_ids` must be a list of `N` ints which contains the group id of each sample.
20 | The group ids must be a set of integers in the range [0, num_groups).
21 | batch_size (int): Size of mini-batch.
22 | """
23 | if not isinstance(sampler, Sampler):
24 | raise ValueError(
25 | "sampler should be an instance of "
26 | "torch.utils.data.Sampler, but got sampler={}".format(sampler)
27 | )
28 | self.sampler = sampler
29 | self.group_ids = np.asarray(group_ids)
30 | assert self.group_ids.ndim == 1
31 | self.batch_size = batch_size
32 | groups = np.unique(self.group_ids).tolist()
33 |
34 | # buffer the indices of each group until batch size is reached
35 | self.buffer_per_group = {k: [] for k in groups}
36 |
37 | def __iter__(self):
38 | for idx in self.sampler:
39 | group_id = self.group_ids[idx]
40 | group_buffer = self.buffer_per_group[group_id]
41 | group_buffer.append(idx)
42 | if len(group_buffer) == self.batch_size:
43 | yield group_buffer[:] # yield a copy of the list
44 | del group_buffer[:]
45 |
46 | def __len__(self):
47 | raise NotImplementedError("len() of GroupedBatchSampler is not well-defined.")
48 |
--------------------------------------------------------------------------------
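A minimal usage sketch for `GroupedBatchSampler` (an assumed example, not taken from the repo): indices are buffered per group id and emitted only as same-group batches.

```
from torch.utils.data.sampler import SequentialSampler

from hgformer.data.samplers.grouped_batch_sampler import GroupedBatchSampler

indices = list(range(8))                 # dummy dataset of 8 samples
group_ids = [0, 1, 0, 1, 0, 1, 0, 1]     # e.g. an aspect-ratio group per sample
sampler = SequentialSampler(indices)

batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size=2)
print(list(batch_sampler))               # [[0, 2], [1, 3], [4, 6], [5, 7]]
```
--------------------------------------------------------------------------------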
/hgformer/modeling/pixel_decoder/ops/src/ms_deform_attn.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #pragma once
17 |
18 | #include "cpu/ms_deform_attn_cpu.h"
19 |
20 | #ifdef WITH_CUDA
21 | #include "cuda/ms_deform_attn_cuda.h"
22 | #endif
23 |
24 |
25 | at::Tensor
26 | ms_deform_attn_forward(
27 | const at::Tensor &value,
28 | const at::Tensor &spatial_shapes,
29 | const at::Tensor &level_start_index,
30 | const at::Tensor &sampling_loc,
31 | const at::Tensor &attn_weight,
32 | const int im2col_step)
33 | {
34 | if (value.type().is_cuda())
35 | {
36 | #ifdef WITH_CUDA
37 | return ms_deform_attn_cuda_forward(
38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
39 | #else
40 | AT_ERROR("Not compiled with GPU support");
41 | #endif
42 | }
43 | AT_ERROR("Not implemented on the CPU");
44 | }
45 |
46 | std::vector<at::Tensor>
47 | ms_deform_attn_backward(
48 | const at::Tensor &value,
49 | const at::Tensor &spatial_shapes,
50 | const at::Tensor &level_start_index,
51 | const at::Tensor &sampling_loc,
52 | const at::Tensor &attn_weight,
53 | const at::Tensor &grad_output,
54 | const int im2col_step)
55 | {
56 | if (value.type().is_cuda())
57 | {
58 | #ifdef WITH_CUDA
59 | return ms_deform_attn_cuda_backward(
60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
61 | #else
62 | AT_ERROR("Not compiled with GPU support");
63 | #endif
64 | }
65 | AT_ERROR("Not implemented on the CPU");
66 | }
67 |
68 |
--------------------------------------------------------------------------------
/hgformer/data/datasets/register_city_c_vis.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import os
3 |
4 | from detectron2.data import DatasetCatalog, MetadataCatalog
5 | from detectron2.data.datasets import load_sem_seg
6 | from detectron2.data.datasets.builtin_meta import _get_builtin_metadata
7 | # from .acdc import load_acdc_semantic
8 | from detectron2.data.datasets.cityscapes import load_cityscapes_semantic
9 |
10 | corruptions = ['clean', 'gaussian_noise', 'shot_noise', 'impulse_noise', 'defocus_blur',
11 | 'glass_blur', 'motion_blur', 'zoom_blur', 'snow', 'frost', 'fog',
12 | 'brightness', 'contrast', 'elastic_transform', 'pixelate', 'jpeg_compression',
13 | 'speckle_noise', 'gaussian_blur', 'spatter', 'saturate']
14 | # ==== Predefined splits for raw cityscapes c images ===========
15 | _RAW_ACDC_SPLITS = {
16 | "city_c_gaussiannoise5_vis": ("gauss_noise/5/", "cityscapes/gtFine/val/"),
17 | "city_c_gaussiannoise4_vis": ("gauss_noise/4/", "cityscapes/gtFine/val/"),
18 | "city_c_gaussiannoise3_vis": ("gauss_noise/3/", "cityscapes/gtFine/val/"),
19 | "city_c_gaussiannoise2_vis": ("gauss_noise/2/", "cityscapes/gtFine/val/"),
20 | "city_c_gaussiannoise1_vis": ("gauss_noise/1/", "cityscapes/gtFine/val/"),
21 | "city_c_gaussiannoise0_vis": ("gauss_noise/0/", "cityscapes/gtFine/val/"),
22 | "city_c_tmp_gaussiannoise4_vis": ("city_c_tmp/gaussian_noise/4/", "cityscapes/gtFine/val/"),
23 | "city_c_tmp_clean_vis": ("city_c_tmp/clean/", "cityscapes/gtFine/val/"),
24 |
25 | }
26 |
27 | def register_all_city_c_vis(root):
28 | for key, (image_dir, gt_dir) in _RAW_ACDC_SPLITS.items():
29 | meta = _get_builtin_metadata("cityscapes")
30 | image_dir = os.path.join(root, image_dir)
31 | gt_dir = os.path.join(root, gt_dir)
32 | # sem_key = key.format(task="sem_seg")
33 | DatasetCatalog.register(
34 | key, lambda x=image_dir, y=gt_dir: load_cityscapes_semantic(x, y)
35 | )
36 | MetadataCatalog.get(key).set(
37 | image_dir=image_dir,
38 | gt_dir=gt_dir,
39 | evaluator_type="cityscapes_sem_seg",
40 | ignore_label=255,
41 | **meta,
42 | )
43 |
44 | _root = os.getenv("DETECTRON2_DATASETS", "datasets")
45 | register_all_city_c_vis(_root)
46 |
--------------------------------------------------------------------------------
/datasets/generate_cityscapes_c.py:
--------------------------------------------------------------------------------
1 | from imagecorruptions import corrupt
2 | from imagecorruptions import get_corruption_names
3 | import os
4 | import cv2
5 | from multiprocessing import Pool
6 | import numpy as np
7 | import random
8 | import mmcv
9 |
10 | random.seed(8) # for reproducibility
11 | np.random.seed(8)
12 | corruptions = ['gaussian_noise', 'shot_noise', 'impulse_noise', 'defocus_blur',
13 | 'glass_blur', 'motion_blur', 'zoom_blur', 'snow', 'frost', 'fog',
14 | 'brightness', 'contrast', 'elastic_transform', 'pixelate', 'jpeg_compression',
15 | 'speckle_noise', 'gaussian_blur', 'spatter', 'saturate']
16 |
17 | img_dir = 'datasets/cityscapes-c/clean'
18 | num_imgs = 500
19 | img_names = []
20 | prog_bar = mmcv.ProgressBar(num_imgs)
21 | img_dict = {}
22 | for img_path in mmcv.scandir(img_dir, suffix='png', recursive=True):
23 | img_name = os.path.join(img_dir, img_path)
24 | img = mmcv.imread(img_name)
25 | img_dict[img_name] = img
26 | prog_bar.update()
27 |
28 | def perturb(i, p, s):
29 | img = corrupt(i, corruption_name=p, severity=s)
30 | return img
31 |
32 | def worker(optuple):
33 | srcfile, p, s, perturbed_img_path = optuple
34 | img = img_dict[srcfile]
35 | perturbed_img = perturb(img, p, s)
36 | mmcv.imwrite(perturbed_img, perturbed_img_path, auto_mkdir=True)
37 |
38 | def convert_img_path(ori_path, suffix):
39 | new_path = ori_path.replace('clean', suffix)
40 | assert new_path != ori_path
41 | return new_path
42 |
43 | if __name__ == '__main__':
44 |
45 | pool = Pool(32)
46 | filelist = []
47 | for p in corruptions:
48 | print("\n ### gen corruption:{} ###".format(p))
49 | for img_path in mmcv.scandir(img_dir, suffix='png', recursive=True):
50 | srcfile = os.path.join(img_dir, img_path)
51 | for s in range(5):
52 | img_suffix = p + "/" + str(s+1)
53 | out_dir = img_dir.replace('clean', img_suffix)
54 | assert out_dir != img_dir
55 | if not os.path.exists(out_dir):
56 | os.makedirs(out_dir)
57 | perturbed_img_path = convert_img_path(srcfile, img_suffix)
58 | filelist.append((srcfile, p, s+1, perturbed_img_path))
59 | # import ipdb; ipdb.set_trace()
60 | pool.map(worker, filelist)
--------------------------------------------------------------------------------
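A minimal single-image sketch of the corruption step performed by the script above (the file paths are placeholders, not paths shipped with the repo):

```
import mmcv
from imagecorruptions import corrupt

img = mmcv.imread("datasets/cityscapes-c/clean/some_city/some_image_leftImg8bit.png")
for severity in range(1, 6):             # severities 1..5, matching `s + 1` above
    corrupted = corrupt(img, corruption_name="gaussian_noise", severity=severity)
    mmcv.imwrite(corrupted,
                 f"datasets/cityscapes-c/gaussian_noise/{severity}/some_city/some_image_leftImg8bit.png",
                 auto_mkdir=True)
```
--------------------------------------------------------------------------------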
/datasets/find_truncated_images.py:
--------------------------------------------------------------------------------
1 | from PIL import Image
2 | import os
3 | import numpy as np
4 |
5 | # https://en.wikipedia.org/wiki/YUV#SDTV_with_BT.601
6 | _M_RGB2YUV = [[0.299, 0.587, 0.114], [-0.14713, -0.28886, 0.436], [0.615, -0.51499, -0.10001]]
7 | _M_YUV2RGB = [[1.0, 0.0, 1.13983], [1.0, -0.39465, -0.58060], [1.0, 2.03211, 0.0]]
8 |
9 | def GetFileFromThisRootDir(dir,ext = None):
10 | allfiles = []
11 | needExtFilter = (ext != None)
12 | for root,dirs,files in os.walk(dir):
13 | for filespath in files:
14 | filepath = os.path.join(root, filespath)
15 | extension = os.path.splitext(filepath)[1][1:]
16 | if needExtFilter and extension in ext:
17 | allfiles.append(filepath)
18 | elif not needExtFilter:
19 | allfiles.append(filepath)
20 | return allfiles
21 |
22 | def convert_PIL_to_numpy(image, format):
23 | """
24 | Convert PIL image to numpy array of target format.
25 |
26 | Args:
27 | image (PIL.Image): a PIL image
28 | format (str): the format of output image
29 |
30 | Returns:
31 | (np.ndarray): also see `read_image`
32 | """
33 | if format is not None:
34 | # PIL only supports RGB, so convert to RGB and flip channels over below
35 | conversion_format = format
36 | if format in ["BGR", "YUV-BT.601"]:
37 | conversion_format = "RGB"
38 | image = image.convert(conversion_format)
39 | image = np.asarray(image)
40 | # PIL squeezes out the channel dimension for "L", so make it HWC
41 | if format == "L":
42 | image = np.expand_dims(image, -1)
43 |
44 | # handle formats not supported by PIL
45 | elif format == "BGR":
46 | # flip channels if needed
47 | image = image[:, :, ::-1]
48 | elif format == "YUV-BT.601":
49 | image = image / 255.0
50 | image = np.dot(image, np.array(_M_RGB2YUV).T)
51 |
52 | return image
53 |
54 | filepath = "/BS/databases15/GTA/images/train"
55 |
56 | filenames = GetFileFromThisRootDir(filepath)
57 | count = 0
58 | for file in filenames:
59 | img = Image.open(file)
60 | print(f'filename: {file}')
61 | try:
62 | img_np = convert_PIL_to_numpy(img, format="RGB")
63 |     except Exception:  # typically an OSError raised while decoding a truncated image file
64 | # import ipdb; ipdb.set_trace()
65 | count = count + 1
66 | print(f"count: {count}")
67 | print(f"count: {count}")
--------------------------------------------------------------------------------
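A lighter-weight alternative sketch (a suggestion, not part of the repo): force a full decode with PIL and catch the `OSError` that PIL raises for truncated files.

```
import os
from PIL import Image

root = "/BS/databases15/GTA/images/train"     # same root as above
truncated = []
for dirpath, _, files in os.walk(root):
    for fname in files:
        path = os.path.join(dirpath, fname)
        try:
            with Image.open(path) as img:
                img.load()                    # full decode; truncated files raise OSError
        except OSError:
            truncated.append(path)
print(f"count: {len(truncated)}")
```
--------------------------------------------------------------------------------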
/datasets/split_data/synthia/split_synthia.py:
--------------------------------------------------------------------------------
1 | from multiprocessing import Pool
2 | import shutil
3 | import os
4 | shutil._USE_CP_SENDFILE = False
5 |
6 | def worker(path_pair):
7 | srcpath, dstpath = path_pair
8 | # print(f'srcpath{srcpath}')
9 | # print(f'dstpath{dstpath}')
10 | # shutil.copyfile(srcpath, dstpath)
11 | shutil.move(srcpath, dstpath)
12 |
13 | if __name__ == '__main__':
14 | pool = Pool(32)
15 | image_path = r'datasets/synthia/RGB'
16 | label_path = r'datasets/synthia/GT/LABELS'
17 |
18 | # dst_image_path = r'datasets/synthia_split/RGB'
19 | # dst_label_path = r'datasets/synthia_split/GT'
20 |
21 | dst_image_path = image_path
22 | dst_label_path = label_path
23 |
24 | with open('datasets/split_data/synthia_split_train.txt', 'r') as f:
25 | train_list = f.readlines()
26 | train_list = [x.strip() for x in train_list]
27 |
28 | with open('datasets/split_data/synthia_split_val.txt', 'r') as f:
29 | val_list = f.readlines()
30 | val_list = [x.strip() for x in val_list]
31 |
32 | train_pairs = []
33 |
34 | if not os.path.exists(os.path.join(dst_image_path, 'train')):
35 | os.makedirs(os.path.join(dst_image_path, 'train'))
36 |
37 | if not os.path.exists(os.path.join(dst_label_path, 'train')):
38 | os.makedirs(os.path.join(dst_label_path, 'train'))
39 |
40 | for file in train_list:
41 | srcfile = os.path.join(image_path, file)
42 | dstfile = os.path.join(dst_image_path, 'train', file)
43 | train_pairs.append((srcfile, dstfile))
44 |
45 | srclabel = os.path.join(label_path, file)
46 | dstlabel = os.path.join(dst_label_path, 'train', file)
47 | train_pairs.append((srclabel, dstlabel))
48 | pool.map(worker, train_pairs)
49 |
50 | val_pairs = []
51 |
52 | if not os.path.exists(os.path.join(dst_image_path, 'val')):
53 | os.makedirs(os.path.join(dst_image_path, 'val'))
54 |
55 | if not os.path.exists(os.path.join(dst_label_path, 'val')):
56 | os.makedirs(os.path.join(dst_label_path, 'val'))
57 |
58 | for file in val_list:
59 | srcfile = os.path.join(image_path, file)
60 | dstfile = os.path.join(dst_image_path, 'val', file)
61 | val_pairs.append((srcfile, dstfile))
62 |
63 | srclabel = os.path.join(label_path, file)
64 | dstlabel = os.path.join(dst_label_path, 'val', file)
65 | val_pairs.append((srclabel, dstlabel))
66 | pool.map(worker, val_pairs)
67 |
--------------------------------------------------------------------------------
/configs/cityscapes/hgformer_R50_bs16_20k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-Cityscapes-SemanticSegmentation.yaml
2 | MODEL:
3 | META_ARCHITECTURE: "GroupFormer"
4 | RESNETS:
5 | NORM: "GN"
6 | SEM_SEG_HEAD:
7 | NAME: "MaskFormerHead"
8 | IGNORE_VALUE: 255
9 | NUM_CLASSES: 19
10 | LOSS_WEIGHT: 1.0
11 | CONVS_DIM: 256
12 | MASK_DIM: 256
13 | NORM: "GN"
14 | # pixel decoder
15 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
16 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
17 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
18 | COMMON_STRIDE: 4
19 | TRANSFORMER_ENC_LAYERS: 6
20 | NUM_GROUP_TOKENS: [512, 32]
21 | NUM_OUTPUT_GROUPS: [512, 32]
22 | # DOWNSAMPLE_RATE: 16 # 0.31
23 | # DOWNSAMPLE_RATE: 8 #
24 | DOWNSAMPLE_RATE: 4 # 0.32s
25 |
26 | # SPIX_RES: [16, 16]
27 | MASK_FORMER:
28 | # TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
29 | TRANSFORMER_DECODER_NAME: "GroupFormerDecoder"
30 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
31 | DEEP_SUPERVISION: True
32 | # DEEP_MASK_SUPERVISION: False
33 | NO_OBJECT_WEIGHT: 0.1
34 | STAGE_WEIGHTS: [1.0]
35 | CLASS_WEIGHT: 2.0
36 | MASK_WEIGHT: 5.0
37 | DICE_WEIGHT: 5.0
38 | SPIX_MASK_WEIGHT: 0.0
39 | SPIX_CLASS_WEIGHT: 2.0
40 | CONTRASTIVE_LOSS: True
41 | CONTRASTIVE_WEIGH: 6.0
42 | CONTRASTIVE_TAU: 0.1
43 | HIDDEN_DIM: 256
44 | NUM_OBJECT_QUERIES: 100
45 | NHEADS: 8
46 | DROPOUT: 0.0
47 | DIM_FEEDFORWARD: 2048
48 | ENC_LAYERS: 0
49 | PRE_NORM: False
50 | ENFORCE_INPUT_PROJ: False
51 | SIZE_DIVISIBILITY: 32
52 |     DEC_LAYERS: 6 # includes one layer for the loss on the learnable query
53 | SPIX_SELF_ATTEN_LAYERS: 6
54 | TRAIN_NUM_POINTS: 12544
55 | OVERSAMPLE_RATIO: 3.0
56 | IMPORTANCE_SAMPLE_RATIO: 0.75
57 | TEST:
58 | SEMANTIC_ON: True
59 | INSTANCE_ON: False
60 | PANOPTIC_ON: False
61 | OVERLAP_THRESHOLD: 0.8
62 | OBJECT_MASK_THRESHOLD: 0.8
63 | SOLVER:
64 | IMS_PER_BATCH: 16
65 | BASE_LR: 0.0001
66 | MAX_ITER: 20000
67 | WARMUP_FACTOR: 1.0
68 | WARMUP_ITERS: 0
69 | WEIGHT_DECAY: 0.05
70 | OPTIMIZER: "ADAMW"
71 | LR_SCHEDULER_NAME: "WarmupPolyLR"
72 | BACKBONE_MULTIPLIER: 0.1
73 | CLIP_GRADIENTS:
74 | ENABLED: True
75 | CLIP_TYPE: "full_model"
76 | CLIP_VALUE: 0.01
77 | NORM_TYPE: 2.0
78 | AMP:
79 | ENABLED: False
80 |
81 | DATALOADER:
82 | FILTER_EMPTY_ANNOTATIONS: True
83 | NUM_WORKERS: 4
84 | VERSION: 2
85 |
86 | CUDNN_BENCHMARK: True
87 | TEST:
88 | CLUSTER_SOFTMAX: True
89 | PRED_STAGE: "spix_pixelexclude0125+stage3"
--------------------------------------------------------------------------------
/GETTING_STARTED.md:
--------------------------------------------------------------------------------
1 | # Getting Started with HGFormer
2 |
3 | This document provides a brief introduction to the usage of HGFormer.
4 |
5 | Please see [Getting Started with Detectron2](https://github.com/facebookresearch/detectron2/blob/master/GETTING_STARTED.md) for full usage.
6 |
7 | ## Evaluation with Pre-trained Models
8 |
9 | Download [models](https://drive.google.com/drive/folders/1fUWaIhXtSxHLdTFxnuOSldLUe_ferauh?usp=drive_link).
10 |
11 | ### Cityscapes -> ACDC
12 |
13 | ```
14 | python demo/inference.py --config-file configs/cityscapes/hgformer_swin_tiny_bs16_20k.yaml \
15 | --input datasets/acdc/rgb_anon/all/test --output path_to_output \
16 | --opts MODEL.WEIGHTS path_to_checkpoint
17 | ```
18 | After running the command, you will find the results in ```path_to_output```. Then follow the instructions on the [ACDC evaluation server](https://acdc.vision.ee.ethz.ch/login?target=%2Fsubmit) to get your scores.
19 | To evaluate on a single condition, replace ```all``` with one of ```fog```, ```snow```, ```night```, or ```rain```.
20 |
21 | ### Cityscapes -> Cityscapes-c
22 |
23 | ```
24 | python test_city_c_level5.py --num-gpus 8 --config-file configs/city_c/hgformer_swin_tiny_bs16_20k.yaml \
25 | --eval-only MODEL.WEIGHTS path_to_checkpoint OUTPUT_DIR path_to_output
26 | ```
27 |
28 | ### Cityscapes -> Others
29 |
30 | ```
31 | python plain_train_net.py --num-gpus 8 --config-file configs/cityscapes/hgformer_swin_tiny_bs16_20k.yaml \
32 | --eval-only MODEL.WEIGHTS path_to_checkpoint OUTPUT_DIR path_to_output
33 | ```
34 |
35 | ### Mapillary -> Others
36 |
37 | ```
38 | python plain_train_net.py --num-gpus 8 --config-file configs/mapillary/hgformer_swin_tiny_bs16_20k_mapillary.yaml \
39 | --eval-only MODEL.WEIGHTS path_to_checkpoint OUTPUT_DIR path_to_output
40 | ```
41 |
42 | ## Training in Command Line
43 |
44 |
45 | To train a model, first
46 | set up the corresponding datasets following
47 | [datasets/README.md](./datasets/README.md), then prepare the models pre-trained on ImageNet classification following [tools/README.md](./tools/README.md). Finally, run:
48 | ```
49 | python plain_train_net.py --num-gpus 8 \
50 | --config-file configs/cityscapes/hgformer_swin_tiny_bs16_20k.yaml OUTPUT_DIR path_to_output
51 | ```
52 |
53 | The configs are made for 8-GPU training.
54 | Since we use the AdamW optimizer, it is not clear how to scale the learning rate with the batch size.
55 | To train on 1 GPU, you need to choose a suitable learning rate and batch size yourself:
56 | ```
57 | python plain_train_net.py \
58 | --config-file configs/cityscapes/hgformer_swin_tiny_bs16_20k.yaml \
59 | --num-gpus 1 SOLVER.IMS_PER_BATCH SET_TO_SOME_REASONABLE_VALUE SOLVER.BASE_LR SET_TO_SOME_REASONABLE_VALUE
60 | ```
61 |
--------------------------------------------------------------------------------
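For example, a purely illustrative (untuned) single-GPU setting is a quarter of the default batch size with a linearly reduced learning rate; treat both values as starting points only, since the linear-scaling rule is not guaranteed to hold for AdamW:

```
python plain_train_net.py \
  --config-file configs/cityscapes/hgformer_swin_tiny_bs16_20k.yaml \
  --num-gpus 1 SOLVER.IMS_PER_BATCH 4 SOLVER.BASE_LR 0.000025
```
--------------------------------------------------------------------------------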
/configs/mapillary/hgformer_R50_bs16_20k_mapillary.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-mapillary19-SemanticSegmentation.yaml
2 | MODEL:
3 | META_ARCHITECTURE: "GroupFormer"
4 | RESNETS:
5 | NORM: "GN"
6 | SEM_SEG_HEAD:
7 | NAME: "MaskFormerHead"
8 | IGNORE_VALUE: 255
9 | NUM_CLASSES: 19
10 | LOSS_WEIGHT: 1.0
11 | CONVS_DIM: 256
12 | MASK_DIM: 256
13 | NORM: "GN"
14 | # pixel decoder
15 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
16 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
17 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
18 | COMMON_STRIDE: 4
19 | TRANSFORMER_ENC_LAYERS: 6
20 | NUM_GROUP_TOKENS: [512, 32]
21 | NUM_OUTPUT_GROUPS: [512, 32]
22 | # DOWNSAMPLE_RATE: 16 # 0.31
23 | # DOWNSAMPLE_RATE: 8 # mapillary: (16, 22)
24 | DOWNSAMPLE_RATE: 4 # 0.32s mapillary: (32, 44), cityscapes: (32, 64)
25 |
26 | # SPIX_RES: [16, 16]
27 | MASK_FORMER:
28 | # TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
29 | TRANSFORMER_DECODER_NAME: "GroupFormerDecoder"
30 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
31 | DEEP_SUPERVISION: True
32 | # DEEP_MASK_SUPERVISION: False
33 | NO_OBJECT_WEIGHT: 0.1
34 | STAGE_WEIGHTS: [1.0]
35 | CLASS_WEIGHT: 2.0
36 | MASK_WEIGHT: 5.0
37 | DICE_WEIGHT: 5.0
38 | SPIX_MASK_WEIGHT: 0.0
39 | SPIX_CLASS_WEIGHT: 2.0
40 | CONTRASTIVE_LOSS: True
41 | CONTRASTIVE_WEIGH: 6.0
42 | CONTRASTIVE_TAU: 0.1
43 | HIDDEN_DIM: 256
44 | NUM_OBJECT_QUERIES: 100
45 | NHEADS: 8
46 | DROPOUT: 0.0
47 | DIM_FEEDFORWARD: 2048
48 | ENC_LAYERS: 0
49 | PRE_NORM: False
50 | ENFORCE_INPUT_PROJ: False
51 | SIZE_DIVISIBILITY: 32
52 | # SIZE_DIVISIBILITY: 64
53 |     DEC_LAYERS: 6 # includes one layer for the loss on the learnable query
54 | SPIX_SELF_ATTEN_LAYERS: 6
55 | TRAIN_NUM_POINTS: 12544
56 | OVERSAMPLE_RATIO: 3.0
57 | IMPORTANCE_SAMPLE_RATIO: 0.75
58 | TEST:
59 | SEMANTIC_ON: True
60 | INSTANCE_ON: False
61 | PANOPTIC_ON: False
62 | OVERLAP_THRESHOLD: 0.8
63 | OBJECT_MASK_THRESHOLD: 0.8
64 | SOLVER:
65 | IMS_PER_BATCH: 16
66 | BASE_LR: 0.0001
67 | MAX_ITER: 20000
68 | WARMUP_FACTOR: 1.0
69 | WARMUP_ITERS: 0
70 | WEIGHT_DECAY: 0.05
71 | OPTIMIZER: "ADAMW"
72 | LR_SCHEDULER_NAME: "WarmupPolyLR"
73 | BACKBONE_MULTIPLIER: 0.1
74 | CLIP_GRADIENTS:
75 | ENABLED: True
76 | CLIP_TYPE: "full_model"
77 | CLIP_VALUE: 0.01
78 | NORM_TYPE: 2.0
79 | AMP:
80 | ENABLED: False
81 | DATALOADER:
82 | FILTER_EMPTY_ANNOTATIONS: True
83 | NUM_WORKERS: 4
84 | VERSION: 2
85 |
86 | CUDNN_BENCHMARK: True
87 |
88 | TEST:
89 | CLUSTER_SOFTMAX: True
90 | PRED_STAGE: "spix_pixelexclude0125+stage3"
--------------------------------------------------------------------------------
/datasets/split_data/gta/split_gta.py:
--------------------------------------------------------------------------------
1 | from multiprocessing import Pool
2 | import shutil
3 | import os
4 | shutil._USE_CP_SENDFILE = False
5 | def worker(path_pair):
6 | srcpath, dstpath = path_pair
7 | # shutil.copyfile(srcpath, dstpath)
8 | shutil.move(srcpath, dstpath)
9 |
10 | if __name__ == '__main__':
11 | pool = Pool(32)
12 | image_path = r'datasets/GTA/images'
13 | label_path = r'datasets/GTA/labels'
14 |
15 | with open('datasets/split_data/gtav_split_train.txt', 'r') as f:
16 | train_list = f.readlines()
17 | train_list = [x.strip() for x in train_list]
18 |
19 | with open('datasets/split_data/gtav_split_val.txt', 'r') as f:
20 | val_list = f.readlines()
21 | val_list = [x.strip() for x in val_list]
22 |
23 | with open('datasets/split_data/gtav_split_test.txt', 'r') as f:
24 | test_list = f.readlines()
25 | test_list = [x.strip() for x in test_list]
26 |
27 | train_pairs = []
28 |
29 | if not os.path.exists(os.path.join(image_path, 'train')):
30 | os.makedirs(os.path.join(image_path, 'train'))
31 |
32 | if not os.path.exists(os.path.join(label_path, 'train')):
33 | os.makedirs(os.path.join(label_path, 'train'))
34 |
35 | for file in train_list:
36 | srcfile = os.path.join(image_path, file)
37 | dstfile = os.path.join(image_path, 'train', file)
38 | train_pairs.append((srcfile, dstfile))
39 |
40 | srclabel = os.path.join(label_path, file)
41 | dstlabel = os.path.join(label_path, 'train', file)
42 | train_pairs.append((srclabel, dstlabel))
43 | pool.map(worker, train_pairs)
44 |
45 | val_pairs = []
46 |
47 | if not os.path.exists(os.path.join(image_path, 'valid')):
48 | os.makedirs(os.path.join(image_path, 'valid'))
49 |
50 | if not os.path.exists(os.path.join(label_path, 'valid')):
51 | os.makedirs(os.path.join(label_path, 'valid'))
52 |
53 | for file in val_list:
54 | srcfile = os.path.join(image_path, file)
55 | dstfile = os.path.join(image_path, 'valid', file)
56 | val_pairs.append((srcfile, dstfile))
57 |
58 | srclabel = os.path.join(label_path, file)
59 | dstlabel = os.path.join(label_path, 'valid', file)
60 | val_pairs.append((srclabel, dstlabel))
61 | pool.map(worker, val_pairs)
62 |
63 | test_pairs = []
64 |
65 | if not os.path.exists(os.path.join(image_path, 'test')):
66 | os.makedirs(os.path.join(image_path, 'test'))
67 |
68 | if not os.path.exists(os.path.join(label_path, 'test')):
69 | os.makedirs(os.path.join(label_path, 'test'))
70 |
71 | for file in test_list:
72 | srcfile = os.path.join(image_path, file)
73 | dstfile = os.path.join(image_path, 'test', file)
74 | test_pairs.append((srcfile, dstfile))
75 |
76 | srclabel = os.path.join(label_path, file)
77 | dstlabel = os.path.join(label_path, 'test', file)
78 | test_pairs.append((srclabel, dstlabel))
79 | pool.map(worker, test_pairs)
--------------------------------------------------------------------------------
/tools/README.md:
--------------------------------------------------------------------------------
1 | This directory contains a few tools for HGFormer.
2 |
3 | * `convert-torchvision-to-d2.py`
4 |
5 | Tool to convert torchvision pre-trained weights for D2.
6 |
7 | ```
8 | wget https://download.pytorch.org/models/resnet101-63fe2227.pth
9 | python tools/convert-torchvision-to-d2.py resnet101-63fe2227.pth R-101.pkl
10 | ```
11 |
12 | * `convert-pretrained-swin-model-to-d2.py`
13 |
14 | Tool to convert Swin Transformer pre-trained weights for D2.
15 |
16 | ```
17 | pip install timm
18 |
19 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth
20 | python tools/convert-pretrained-swin-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl
21 |
22 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth
23 | python tools/convert-pretrained-swin-model-to-d2.py swin_small_patch4_window7_224.pth swin_small_patch4_window7_224.pkl
24 |
25 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth
26 | python tools/convert-pretrained-swin-model-to-d2.py swin_base_patch4_window12_384_22k.pth swin_base_patch4_window12_384_22k.pkl
27 |
28 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth
29 | python tools/convert-pretrained-swin-model-to-d2.py swin_large_patch4_window12_384_22k.pth swin_large_patch4_window12_384_22k.pkl
30 | ```
31 |
32 | * `evaluate_pq_for_semantic_segmentation.py`
33 |
34 | Tool to evaluate PQ (PQ-stuff) for semantic segmentation predictions.
35 |
36 | Usage:
37 |
38 | ```
39 | python tools/evaluate_pq_for_semantic_segmentation.py --dataset-name ade20k_sem_seg_val --json-file OUTPUT_DIR/inference/sem_seg_predictions.json
40 | ```
41 |
42 | where `OUTPUT_DIR` is set in the config file.
43 |
44 | * `evaluate_coco_boundary_ap.py`
45 |
46 | Tool to evaluate Boundary AP for instance segmentation predictions.
47 |
48 | Usage:
49 |
50 | ```
51 | python tools/evaluate_coco_boundary_ap.py --gt-json-file COCO_GT_JSON --dt-json-file COCO_DT_JSON
52 | ```
53 |
54 | To install Boundary IoU API, run:
55 |
56 | ```
57 | pip install git+https://github.com/bowenc0221/boundary-iou-api.git
58 | ```
59 |
60 | * `analyze_model.py`
61 |
62 | Tool to analyze model parameters and flops.
63 |
64 | Usage for semantic segmentation (ADE20K only, use with caution!):
65 |
66 | ```
67 | python tools/analyze_model.py --num-inputs 1 --tasks flop --use-fixed-input-size --config-file CONFIG_FILE
68 | ```
69 |
70 | Note that, for semantic segmentation (ADE20K only), we use a dummy image with a fixed size equal to `cfg.INPUT.CROP.SIZE[0] x cfg.INPUT.CROP.SIZE[0]`.
71 | Please do not use `--use-fixed-input-size` for calculating FLOPs on other datasets like Cityscapes!
72 |
73 | Usage for panoptic and instance segmentation:
74 |
75 | ```
76 | python tools/analyze_model.py --num-inputs 100 --tasks flop --config-file CONFIG_FILE
77 | ```
78 |
79 | Note that, for panoptic and instance segmentation, we compute the average flops over 100 real validation images.
80 |
--------------------------------------------------------------------------------
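A small sanity-check sketch for a converted checkpoint (this assumes the usual detectron2 `.pkl` layout, a pickled dict with a `model` sub-dict; it is not one of the tools above):

```
import pickle

with open("swin_tiny_patch4_window7_224.pkl", "rb") as f:
    ckpt = pickle.load(f)

print(ckpt.keys())                  # typically dict_keys(['model', '__author__', 'matching_heuristics'])
weights = ckpt["model"]
for name in list(weights)[:5]:      # values are numpy arrays or torch tensors, depending on the converter
    print(name, weights[name].shape)
```
--------------------------------------------------------------------------------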
/hgformer/modeling/transformer_decoder/position_encoding.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py
3 | """
4 | Various positional encodings for the transformer.
5 | """
6 | import math
7 |
8 | import torch
9 | from torch import nn
10 |
11 |
12 | class PositionEmbeddingSine(nn.Module):
13 | """
14 | This is a more standard version of the position embedding, very similar to the one
15 | used by the Attention is all you need paper, generalized to work on images.
16 | """
17 |
18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
19 | super().__init__()
20 | self.num_pos_feats = num_pos_feats
21 | self.temperature = temperature
22 | self.normalize = normalize
23 | if scale is not None and normalize is False:
24 | raise ValueError("normalize should be True if scale is passed")
25 | if scale is None:
26 | scale = 2 * math.pi
27 | self.scale = scale
28 |
29 | def forward(self, x, mask=None):
30 | if mask is None:
31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool)
32 | # mask: e.g. shape [2, 16, 32], [B, H, W]
33 | not_mask = ~mask
34 | y_embed = not_mask.cumsum(1, dtype=torch.float32) # [B, H, W]
35 | x_embed = not_mask.cumsum(2, dtype=torch.float32) # [B, H, W]
36 | if self.normalize:
37 | eps = 1e-6
38 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale # normalize the coordinates, then multiply 2pi
39 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
40 |
41 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) # [128]
42 | # import ipdb; ipdb.set_trace()
43 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
44 |
45 | pos_x = x_embed[:, :, :, None] / dim_t # [B, H, W, num_pos_feats]
46 | pos_y = y_embed[:, :, :, None] / dim_t
47 | pos_x = torch.stack(
48 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
49 | ).flatten(3) # [B, H, W, num_pos_feats]
50 | # import ipdb; ipdb.set_trace()
51 | pos_y = torch.stack(
52 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
53 | ).flatten(3)
54 |
55 | # import ipdb; ipdb.set_trace()
56 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) # [B, 2*num_pos_feats, H, W], 2 * num_pos_feats is equal to the number of feat channels
57 | return pos
58 |
59 | def __repr__(self, _repr_indent=4):
60 | head = "Positional encoding " + self.__class__.__name__
61 | body = [
62 | "num_pos_feats: {}".format(self.num_pos_feats),
63 | "temperature: {}".format(self.temperature),
64 | "normalize: {}".format(self.normalize),
65 | "scale: {}".format(self.scale),
66 | ]
67 | # _repr_indent = 4
68 | lines = [head] + [" " * _repr_indent + line for line in body]
69 | return "\n".join(lines)
70 |
--------------------------------------------------------------------------------
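A minimal usage sketch for `PositionEmbeddingSine` (an assumed example, not from the repo): `num_pos_feats` is typically half the feature channel count, so the concatenated (y, x) encoding matches the feature map's channels.

```
import torch

from hgformer.modeling.transformer_decoder.position_encoding import PositionEmbeddingSine

feats = torch.randn(2, 256, 16, 32)            # [B, C, H, W]
pos_enc = PositionEmbeddingSine(num_pos_feats=128, normalize=True)
pos = pos_enc(feats)                           # mask=None -> no padding assumed
print(pos.shape)                               # torch.Size([2, 256, 16, 32])
```
--------------------------------------------------------------------------------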
/MODEL_ZOO.md:
--------------------------------------------------------------------------------
1 | # HGFormer Model Zoo and Baselines
2 |
3 | #### Detectron2 ImageNet Pretrained Models
4 |
5 | It's common to initialize from backbone models pre-trained on ImageNet classification tasks.
6 |
7 | To prepare the backbones pre-trained on ImageNet classification, please follow [tools/README.md](./tools/README.md).
8 |
9 | #### License
10 |
11 | All models available for download through this document are licensed under the
12 | [Creative Commons Attribution-NonCommercial 4.0 International License](https://creativecommons.org/licenses/by-nc/4.0/).
13 |
14 | ## Cityscapes -> ACDC
15 | | Method | Backbone | Fog | Night | Rain | Snow | All | Download |
16 | |:-----------:|:---------:|:-----:|:-----:|:-----:|:-----:|:-----:|:--------:|
17 | | Mask2former | Swin-Tiny | 54.06 | 38.11 | 59.54 | 55.76 | 53.65 | [model](https://drive.google.com/drive/folders/1eL38sFGdUNV8o9EbFsheurHjdm-CNg5K?usp=sharing) |
18 | | HGFormer | Swin-Tiny | 59.82 | 41.88 | 60.92 | 60.82 | 56.95 | [model](https://drive.google.com/drive/folders/1Rq1PnaYTFACpZX_-oXTq7laCa0zwbfFR?usp=drive_link) |
19 |
20 |
21 | ## Cityscapes -> Cityscapes-C (level 5)
22 | | Method | Backbone | Average | Motion | Defoc | Glass | Gauss (blur) | Gauss (noise) | Impul | Shot | Speck | Bright | Contr | Satur | JPEG | Snow | Spatt | Fog | Frost | Download |
23 | |:-----------:|:-----------:|:---------:|:------:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:------:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:--------:|
24 | | Mask2former | Swin-Tiny | 41.68 | 51.61 | 51.52 | 39.69 | 46.71 | 6.89 | 7.68 | 12.75 | 44.10 | 72.71 | 58.60 | 69.14 | 22.86 | 26.10 | 58.35 | 67.12 | 31.11 | [model](https://drive.google.com/drive/folders/1eL38sFGdUNV8o9EbFsheurHjdm-CNg5K?usp=sharing) |
25 | | HGFormer | Swin-Tiny | 43.81 | 52.51 | 53.03 | 39.02 | 47.93 | 16.45 | 16.03 | 20.55 | 48.44 | 74.51 | 57.14 | 70.53 | 27.32 | 25.66 | 59.19 | 66.49 | 26.11 | [model](https://drive.google.com/drive/folders/1Rq1PnaYTFACpZX_-oXTq7laCa0zwbfFR?usp=drive_link) |
26 | ## Cityscapes -> Others
27 | | Method | Backbone | Mapillary | BDD | GTA | Synthia | Average | Download |
28 | |:-----------:|:---------:|:---------:|:-----:|:-----:|:-------:|:--------:|:--------:|
29 | | Mask2former | Swin-Tiny | 65.28 | 49.87 | 51.38 | 34.76 | 50.32 | [model](https://drive.google.com/drive/folders/1eL38sFGdUNV8o9EbFsheurHjdm-CNg5K?usp=sharing) |
30 | | HGFormer | Swin-Tiny | 67.22 | 52.69 | 51.94 | 32.98 | 51.21 |[model](https://drive.google.com/drive/folders/1Rq1PnaYTFACpZX_-oXTq7laCa0zwbfFR?usp=drive_link) |
31 | ## Mapillary -> Others
32 |
33 | | Method | Backbone | GTA | Synthia | Cityscapes | BDD | Average | Download |
34 | |:-----------:|:---------:|:-----:|:-------:|:----------:|:-----:|:-------:|:--------:|
35 | | Mask2former | Swin-Tiny | 57.81 | 40.14 | 68.23 | 59.05 | 56.31 | [model](https://drive.google.com/drive/folders/1xqvAcQZs2NZhUD5dG2KGPmYBnlkH4u-s?usp=drive_link) |
36 | | HGFormer | Swin-Tiny | 60.79 | 39.15 | 69.28 | 62.22 | 57.86 | [model](https://drive.google.com/drive/folders/1XJgHBKT7J-_Gzqgzo3EiX0wAnjXMNCGG?usp=drive_link) |
37 |
38 | ## Disclaimer
39 | The numbers differ slightly from the results reported in the paper because we presented an average of three runs in the paper.
--------------------------------------------------------------------------------
/hgformer/modeling/pixel_decoder/ops/setup.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 |
12 | import os
13 | import glob
14 |
15 | import torch
16 |
17 | from torch.utils.cpp_extension import CUDA_HOME
18 | from torch.utils.cpp_extension import CppExtension
19 | from torch.utils.cpp_extension import CUDAExtension
20 |
21 | from setuptools import find_packages
22 | from setuptools import setup
23 |
24 | requirements = ["torch", "torchvision"]
25 |
26 | def get_extensions():
27 | this_dir = os.path.dirname(os.path.abspath(__file__))
28 | extensions_dir = os.path.join(this_dir, "src")
29 |
30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
33 |
34 | sources = main_file + source_cpu
35 | extension = CppExtension
36 | extra_compile_args = {"cxx": []}
37 | define_macros = []
38 |
39 |     # FORCE_CUDA allows building the CUDA extension without a visible GPU: torch.cuda.is_available() requires a device, not just an installed CUDA toolkit.
40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None:
41 | extension = CUDAExtension
42 | sources += source_cuda
43 | define_macros += [("WITH_CUDA", None)]
44 | extra_compile_args["nvcc"] = [
45 | "-DCUDA_HAS_FP16=1",
46 | "-D__CUDA_NO_HALF_OPERATORS__",
47 | "-D__CUDA_NO_HALF_CONVERSIONS__",
48 | "-D__CUDA_NO_HALF2_OPERATORS__",
49 | ]
50 | else:
51 | if CUDA_HOME is None:
52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.')
53 | else:
54 | raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().')
55 |
56 | sources = [os.path.join(extensions_dir, s) for s in sources]
57 | include_dirs = [extensions_dir]
58 | ext_modules = [
59 | extension(
60 | "MultiScaleDeformableAttention",
61 | sources,
62 | include_dirs=include_dirs,
63 | define_macros=define_macros,
64 | extra_compile_args=extra_compile_args,
65 | )
66 | ]
67 | return ext_modules
68 |
69 | setup(
70 | name="MultiScaleDeformableAttention",
71 | version="1.0",
72 | author="Weijie Su",
73 | url="https://github.com/fundamentalvision/Deformable-DETR",
74 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
75 | packages=find_packages(exclude=("configs", "tests",)),
76 | ext_modules=get_extensions(),
77 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
78 | )
79 |
--------------------------------------------------------------------------------
/datasets/prepare_mapillary_sem_seg.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | # Copyright (c) Facebook, Inc. and its affiliates.
4 | import os
5 | from pathlib import Path
6 |
7 | import numpy as np
8 | import tqdm
9 | from PIL import Image
10 | from multiprocessing import Pool
11 |
12 | ignore_label = 255
13 |
14 | id_to_ignore_or_group = {}
15 |
16 | # def gen_id_to_ignore():
17 | # global id_to_ignore_or_group
18 | for i in range(66):
19 | id_to_ignore_or_group[i] = ignore_label
20 |
21 | ### Convert each class to a corresponding cityscapes class
22 | ### Road
23 | # Road
24 | id_to_ignore_or_group[13] = 0
25 | # Lane Marking - General
26 | id_to_ignore_or_group[24] = 0
27 | # Manhole
28 | id_to_ignore_or_group[41] = 0
29 |
30 | ### Sidewalk
31 | # Curb
32 | id_to_ignore_or_group[2] = 1
33 | # Sidewalk
34 | id_to_ignore_or_group[15] = 1
35 |
36 | ### Building
37 | # Building
38 | id_to_ignore_or_group[17] = 2
39 |
40 | ### Wall
41 | # Wall
42 | id_to_ignore_or_group[6] = 3
43 |
44 | ### Fence
45 | # Fence
46 | id_to_ignore_or_group[3] = 4
47 |
48 | ### Pole
49 | # Pole
50 | id_to_ignore_or_group[45] = 5
51 | # Utility Pole
52 | id_to_ignore_or_group[47] = 5
53 |
54 | ### Traffic Light
55 | # Traffic Light
56 | id_to_ignore_or_group[48] = 6
57 |
58 | ### Traffic Sign
59 | # Traffic Sign
60 | id_to_ignore_or_group[50] = 7
61 |
62 | ### Vegetation
63 | # Vegetation
64 | id_to_ignore_or_group[30] = 8
65 |
66 | ### Terrain
67 | # Terrain
68 | id_to_ignore_or_group[29] = 9
69 |
70 | ### Sky
71 | # Sky
72 | id_to_ignore_or_group[27] = 10
73 |
74 | ### Person
75 | # Person
76 | id_to_ignore_or_group[19] = 11
77 |
78 | ### Rider
79 | # Bicyclist
80 | id_to_ignore_or_group[20] = 12
81 | # Motorcyclist
82 | id_to_ignore_or_group[21] = 12
83 | # Other Rider
84 | id_to_ignore_or_group[22] = 12
85 |
86 | ### Car
87 | # Car
88 | id_to_ignore_or_group[55] = 13
89 |
90 | ### Truck
91 | # Truck
92 | id_to_ignore_or_group[61] = 14
93 |
94 | ### Bus
95 | # Bus
96 | id_to_ignore_or_group[54] = 15
97 |
98 | ### Train
99 | # On Rails
100 | id_to_ignore_or_group[58] = 16
101 |
102 | ### Motorcycle
103 | # Motorcycle
104 | id_to_ignore_or_group[57] = 17
105 |
106 | ### Bicycle
107 | # Bicycle
108 | id_to_ignore_or_group[52] = 18
109 |
110 |
111 |
112 | def convert(filetuple):
113 | input, outputpath = filetuple
114 | lab = np.asarray(Image.open(input))
115 | assert lab.dtype == np.uint8
116 | output = np.zeros_like(lab, dtype=np.uint8) + 255
117 | for obj_id in np.unique(lab):
118 | # print(f'obj_id{obj_id}')
119 | # print(f'{id_to_ignore_or_group}')
120 | if obj_id in id_to_ignore_or_group:
121 | output[lab == obj_id] = id_to_ignore_or_group[obj_id]
122 |
123 | Image.fromarray(output).save(outputpath)
124 |
125 | if __name__ == "__main__":
126 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "mapillary"
127 | pool = Pool(32)
128 | # gen_id_to_ignore()
129 | # import ipdb; ipdb.set_trace()
130 | for name in ["training", "validation"]:
131 | annotation_dir = dataset_dir / name / "labels"
132 | output_dir = dataset_dir / "labels_detectron2" / name
133 | output_dir.mkdir(parents=True, exist_ok=True)
134 | filelist = []
135 | for file in tqdm.tqdm(list(annotation_dir.iterdir())):
136 | output_file = output_dir / file.name
137 | # convert(file, output_file)
138 | filelist.append((file, output_file))
139 | pool.map(convert, filelist)
--------------------------------------------------------------------------------
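The per-class loop in `convert` above can equivalently be written as a single lookup-table indexing; a minimal alternative sketch (not the repo's implementation), reusing the `id_to_ignore_or_group` mapping defined above:

```
import numpy as np
from PIL import Image

# 256-entry lookup table: everything maps to ignore (255) unless listed in the mapping.
lut = np.full(256, 255, dtype=np.uint8)
for mapillary_id, train_id in id_to_ignore_or_group.items():   # mapping from the file above
    lut[mapillary_id] = train_id

def convert_with_lut(input_path, output_path):
    lab = np.asarray(Image.open(input_path))     # uint8 Mapillary label ids
    Image.fromarray(lut[lab]).save(output_path)  # vectorized remapping
```
--------------------------------------------------------------------------------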
/datasets/prepare_synthia_sem_seg.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | # Copyright (c) Facebook, Inc. and its affiliates.
4 | import os
5 | from pathlib import Path
6 |
7 | import numpy as np
8 | import tqdm
9 | from PIL import Image
10 | from multiprocessing import Pool
11 | import cv2
12 | # imageio.v2 keeps the legacy imread API used by the commented-out variant of convert() below
13 | import imageio.v2 as imageio
14 | ignore_label = 255
15 |
16 | # mapping based on README.txt from SYNTHIA_RAND_CITYSCAPES
17 | trainid_to_trainid = {
18 | 0: ignore_label, # void
19 | 1: 10, # sky
20 | 2: 2, # building
21 | 3: 0, # road
22 | 4: 1, # sidewalk
23 | 5: 4, # fence
24 | 6: 8, # vegetation
25 | 7: 5, # pole
26 | 8: 13, # car
27 | 9: 7, # traffic sign
28 | 10: 11, # pedestrian - person
29 | 11: 18, # bicycle
30 | 12: 17, # motorcycle
31 | 13: ignore_label, # parking-slot
32 | 14: ignore_label, # road-work
33 | 15: 6, # traffic light
34 | 16: 9, # terrain
35 | 17: 12, # rider
36 | 18: 14, # truck
37 | 19: 15, # bus
38 | 20: 16, # train
39 | 21: 3, # wall
40 | 22: ignore_label # Lanemarking
41 | }
42 |
43 | # def convert(filetupe):
44 | # input, outputpath = filetupe
45 | # # lab = np.asarray(Image.open(input))
46 | # # lab = imageio.imread(input, format='PNG-FI')
47 | # lab = imageio.imread(input, format='PNG')
48 | #
49 | # # print(input)
50 | # # lab = cv2.imread(str(input), cv2.IMREAD_UNCHANGED)[:, :, -1]
51 | # lab = np.array(lab, dtype=np.uint8)[:, :, 0]
52 | # assert lab.dtype == np.uint8
53 | # output = np.zeros_like(lab, dtype=np.uint8) + 255
54 | # for obj_id in np.unique(lab):
55 | # if obj_id in trainid_to_trainid:
56 | # output[lab == obj_id] = trainid_to_trainid[obj_id]
57 | #
58 | # Image.fromarray(output).save(outputpath)
59 |
60 |
61 | def convert(filetuple):
62 |     file, new_file = filetuple
63 | # re-assign labels to match the format of Cityscapes
64 | # PIL does not work with the image format, but cv2 does
65 | label = cv2.imread(str(file), cv2.IMREAD_UNCHANGED)[:, :, -1]
66 |
67 | label_copy = 255 * np.ones(label.shape, dtype=np.uint8)
68 | sample_class_stats = {}
69 | for k, v in trainid_to_trainid.items():
70 | k_mask = label == k
71 | label_copy[k_mask] = v
72 | n = int(np.sum(k_mask))
73 | if n > 0:
74 | sample_class_stats[v] = n
75 | # new_file = file.replace('.png', '_labelTrainIds.png')
76 | # assert file != new_file
77 | # sample_class_stats['file'] = new_file
78 | Image.fromarray(label_copy, mode='L').save(new_file)
79 | # return sample_class_stats
80 |
81 | if __name__ == "__main__":
82 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "synthia"
83 | pool = Pool(32)
84 | for name in ["train", "val"]:
85 | # for name in ["train"]:
86 | annotation_dir = dataset_dir / "GT" / "LABELS" / name
87 | output_dir = dataset_dir / "labels_detectron2" / name
88 | output_dir.mkdir(parents=True, exist_ok=True)
89 | filelist = []
90 | for file in tqdm.tqdm(list(annotation_dir.iterdir())):
91 | output_file = output_dir / file.name
92 | # convert(file, output_file)
93 | filelist.append((file, output_file))
94 | pool.map(convert, filelist)
--------------------------------------------------------------------------------
/hgformer/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 |
12 | from __future__ import absolute_import
13 | from __future__ import print_function
14 | from __future__ import division
15 |
16 | import torch
17 | import torch.nn.functional as F
18 | from torch.autograd import Function
19 | from torch.autograd.function import once_differentiable
20 |
21 | try:
22 | import MultiScaleDeformableAttention as MSDA
23 | except ModuleNotFoundError as e:
24 | info_string = (
25 | "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n"
26 |         "\t`cd hgformer/modeling/pixel_decoder/ops`\n"
27 | "\t`sh make.sh`\n"
28 | )
29 | raise ModuleNotFoundError(info_string)
30 |
31 |
32 | class MSDeformAttnFunction(Function):
33 | @staticmethod
34 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
35 | ctx.im2col_step = im2col_step
36 | output = MSDA.ms_deform_attn_forward(
37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
38 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
39 | return output
40 |
41 | @staticmethod
42 | @once_differentiable
43 | def backward(ctx, grad_output):
44 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
45 | grad_value, grad_sampling_loc, grad_attn_weight = \
46 | MSDA.ms_deform_attn_backward(
47 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
48 |
49 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
50 |
51 |
52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
53 | # for debug and test only,
54 | # need to use cuda version instead
55 | N_, S_, M_, D_ = value.shape
56 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape
57 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
58 | sampling_grids = 2 * sampling_locations - 1
59 | sampling_value_list = []
60 | for lid_, (H_, W_) in enumerate(value_spatial_shapes):
61 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
62 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)
63 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
64 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
65 | # N_*M_, D_, Lq_, P_
66 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
67 | mode='bilinear', padding_mode='zeros', align_corners=False)
68 | sampling_value_list.append(sampling_value_l_)
69 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
70 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
71 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
72 | return output.transpose(1, 2).contiguous()
73 |
--------------------------------------------------------------------------------
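A shape-level sketch (not from the repo) of the pure-PyTorch reference path `ms_deform_attn_core_pytorch`. Note that importing the module still requires the compiled `MultiScaleDeformableAttention` extension because of the import guard above.

```
import torch

from hgformer.modeling.pixel_decoder.ops.functions.ms_deform_attn_func import (
    ms_deform_attn_core_pytorch,
)

N, M, D = 1, 8, 32                                  # batch, heads, channels per head
spatial_shapes = [(32, 32), (16, 16)]               # two feature levels
L, P, Lq = len(spatial_shapes), 4, 100              # levels, sampling points, queries
S = sum(h * w for h, w in spatial_shapes)

value = torch.rand(N, S, M, D)
sampling_locations = torch.rand(N, Lq, M, L, P, 2)  # normalized (x, y) in [0, 1]
attention_weights = torch.rand(N, Lq, M, L, P)
attention_weights = attention_weights / attention_weights.sum(dim=(-1, -2), keepdim=True)

out = ms_deform_attn_core_pytorch(value, spatial_shapes, sampling_locations, attention_weights)
print(out.shape)                                    # torch.Size([1, 100, 256]), i.e. [N, Lq, M*D]
```
--------------------------------------------------------------------------------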
/hgformer/test_time_augmentation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import copy
3 | import logging
4 | from itertools import count
5 |
6 | import numpy as np
7 | import torch
8 | from fvcore.transforms import HFlipTransform
9 | from torch import nn
10 | from torch.nn.parallel import DistributedDataParallel
11 |
12 | from detectron2.data.detection_utils import read_image
13 | from detectron2.modeling import DatasetMapperTTA
14 |
15 |
16 | __all__ = [
17 | "SemanticSegmentorWithTTA",
18 | ]
19 |
20 |
21 | class SemanticSegmentorWithTTA(nn.Module):
22 | """
23 | A SemanticSegmentor with test-time augmentation enabled.
24 | Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`.
25 | """
26 |
27 | def __init__(self, cfg, model, tta_mapper=None, batch_size=1):
28 | """
29 | Args:
30 | cfg (CfgNode):
31 | model (SemanticSegmentor): a SemanticSegmentor to apply TTA on.
32 | tta_mapper (callable): takes a dataset dict and returns a list of
33 | augmented versions of the dataset dict. Defaults to
34 | `DatasetMapperTTA(cfg)`.
35 | batch_size (int): batch the augmented images into this batch size for inference.
36 | """
37 | super().__init__()
38 | if isinstance(model, DistributedDataParallel):
39 | model = model.module
40 | self.cfg = cfg.clone()
41 |
42 | self.model = model
43 |
44 | if tta_mapper is None:
45 | tta_mapper = DatasetMapperTTA(cfg)
46 | self.tta_mapper = tta_mapper
47 | self.batch_size = batch_size
48 |
49 | def __call__(self, batched_inputs):
50 | """
51 | Same input/output format as :meth:`SemanticSegmentor.forward`
52 | """
53 |
54 | def _maybe_read_image(dataset_dict):
55 | ret = copy.copy(dataset_dict)
56 | if "image" not in ret:
57 | image = read_image(ret.pop("file_name"), self.model.input_format)
58 | image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW
59 | ret["image"] = image
60 | if "height" not in ret and "width" not in ret:
61 | ret["height"] = image.shape[1]
62 | ret["width"] = image.shape[2]
63 | return ret
64 |
65 | processed_results = []
66 | for x in batched_inputs:
67 | result = self._inference_one_image(_maybe_read_image(x))
68 | processed_results.append(result)
69 | return processed_results
70 |
71 | def _inference_one_image(self, input):
72 | """
73 | Args:
74 | input (dict): one dataset dict with "image" field being a CHW tensor
75 | Returns:
76 | dict: one output dict
77 | """
78 | orig_shape = (input["height"], input["width"])
79 | augmented_inputs, tfms = self._get_augmented_inputs(input)
80 |
81 | final_predictions = None
82 | count_predictions = 0
83 | for input, tfm in zip(augmented_inputs, tfms):
84 | count_predictions += 1
85 | with torch.no_grad():
86 | if final_predictions is None:
87 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms):
88 | final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2])
89 | else:
90 | final_predictions = self.model([input])[0].pop("sem_seg")
91 | else:
92 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms):
93 | final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2])
94 | else:
95 | final_predictions += self.model([input])[0].pop("sem_seg")
96 |
97 | final_predictions = final_predictions / count_predictions
98 | return {"sem_seg": final_predictions}
99 |
100 | def _get_augmented_inputs(self, input):
101 | augmented_inputs = self.tta_mapper(input)
102 | tfms = [x.pop("transforms") for x in augmented_inputs]
103 | return augmented_inputs, tfms
104 |
--------------------------------------------------------------------------------
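A minimal wiring sketch for `SemanticSegmentorWithTTA` (an assumed example, not from the repo); a dummy module stands in for a trained SemanticSegmentor so the augmentation-and-averaging path can be exercised in isolation.

```
import torch
from torch import nn
from detectron2.config import get_cfg

from hgformer.test_time_augmentation import SemanticSegmentorWithTTA


class DummySegmentor(nn.Module):
    # Stand-in for a real SemanticSegmentor: returns random 19-class logits at full resolution.
    input_format = "BGR"

    def forward(self, batched_inputs):
        h, w = batched_inputs[0]["height"], batched_inputs[0]["width"]
        return [{"sem_seg": torch.rand(19, h, w)}]


cfg = get_cfg()
cfg.TEST.AUG.MIN_SIZES = [512, 1024]   # two scales; FLIP defaults to True -> 4 augmented views

tta = SemanticSegmentorWithTTA(cfg, DummySegmentor())
image = torch.randint(0, 256, (3, 1024, 2048), dtype=torch.uint8)  # CHW dummy image
out = tta([{"image": image, "height": 1024, "width": 2048}])
print(out[0]["sem_seg"].shape)         # torch.Size([19, 1024, 2048]), averaged over the views
```
--------------------------------------------------------------------------------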
/hgformer/modeling/meta_arch/group_former_head.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import logging
3 | from copy import deepcopy
4 | from typing import Callable, Dict, List, Optional, Tuple, Union
5 |
6 | import fvcore.nn.weight_init as weight_init
7 | from torch import nn
8 | from torch.nn import functional as F
9 |
10 | from detectron2.config import configurable
11 | from detectron2.layers import Conv2d, ShapeSpec, get_norm
12 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
13 |
14 | from ..transformer_decoder.maskformer_transformer_decoder import build_transformer_decoder
15 | from ..pixel_decoder.fpn import build_pixel_decoder
16 |
17 |
18 | @SEM_SEG_HEADS_REGISTRY.register()
19 | class GroupFormerHead(nn.Module):
20 |
21 | @configurable
22 | def __init__(
23 | self,
24 | input_shape: Dict[str, ShapeSpec],
25 | *,
26 | num_classes: int,
27 | pixel_decoder: nn.Module,
28 | loss_weight: float = 1.0,
29 | ignore_value: int = -1,
30 | # extra parameters
31 | transformer_predictor: nn.Module,
32 | transformer_in_feature: str,
33 | ):
34 | """
35 | NOTE: this interface is experimental.
36 | Args:
37 | input_shape: shapes (channels and stride) of the input features
38 | num_classes: number of classes to predict
39 | pixel_decoder: the pixel decoder module
40 | loss_weight: loss weight
41 | ignore_value: category id to be ignored during training.
42 | transformer_predictor: the transformer decoder that makes prediction
43 | transformer_in_feature: input feature name to the transformer_predictor
44 | """
45 | super().__init__()
46 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
47 | self.in_features = [k for k, v in input_shape]
48 | feature_strides = [v.stride for k, v in input_shape]
49 | feature_channels = [v.channels for k, v in input_shape]
50 |
51 | self.ignore_value = ignore_value
52 | self.common_stride = 4
53 | self.loss_weight = loss_weight
54 |
55 | self.pixel_decoder = pixel_decoder
56 | self.predictor = transformer_predictor
57 | self.transformer_in_feature = transformer_in_feature
58 |
59 | self.num_classes = num_classes
60 |
61 | @classmethod
62 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
63 | # figure out in_channels to transformer predictor
64 | if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder":
65 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
66 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding":
67 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
68 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder": # for maskformer2
69 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
70 | else:
71 | transformer_predictor_in_channels = input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels
72 |
73 | return {
74 | "input_shape": {
75 | k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
76 | },
77 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
78 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
79 | "pixel_decoder": build_pixel_decoder(cfg, input_shape),
80 | "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT,
81 | "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE,
82 | "transformer_predictor": build_transformer_decoder(
83 | cfg,
84 | transformer_predictor_in_channels,
85 | mask_classification=True,
86 | ),
87 | }
88 |
89 | def forward(self, features, mask=None):
90 | return self.layers(features, mask)
91 |
92 | def layers(self, features, mask=None):
93 | multi_scale_features = self.pixel_decoder.forward_features(features)
94 | if self.transformer_in_feature == "multi_scale_pixel_decoder":
95 | predictions = self.predictor(multi_scale_features, mask)
96 | else:
97 | raise NotImplementedError
98 | return predictions
99 |
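
The decorator above registers `GroupFormerHead` in detectron2's semantic-segmentation-head registry under its class name, which is how `cfg.MODEL.SEM_SEG_HEAD.NAME` selects it. A minimal sketch of the lookup only (not of a full model build):

```
from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
# importing the module runs the @SEM_SEG_HEADS_REGISTRY.register() decorator
from hgformer.modeling.meta_arch.group_former_head import GroupFormerHead  # noqa: F401

head_cls = SEM_SEG_HEADS_REGISTRY.get("GroupFormerHead")
# detectron2's build_sem_seg_head(cfg, input_shape) performs this lookup with
# cfg.MODEL.SEM_SEG_HEAD.NAME and then calls head_cls(cfg, input_shape)
```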
--------------------------------------------------------------------------------
/hgformer/utils/misc.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py
3 | """
4 | Misc functions, including distributed helpers.
5 |
6 | Mostly copy-paste from torchvision references.
7 | """
8 | from typing import List, Optional
9 |
10 | import torch
11 | import torch.distributed as dist
12 | import torchvision
13 | from torch import Tensor
14 |
15 |
16 | def _max_by_axis(the_list):
17 | # type: (List[List[int]]) -> List[int]
18 | maxes = the_list[0]
19 | for sublist in the_list[1:]:
20 | for index, item in enumerate(sublist):
21 | maxes[index] = max(maxes[index], item)
22 | return maxes
23 |
24 |
25 | class NestedTensor(object):
26 | def __init__(self, tensors, mask: Optional[Tensor]):
27 | self.tensors = tensors
28 | self.mask = mask
29 |
30 | def to(self, device):
31 | # type: (Device) -> NestedTensor # noqa
32 | cast_tensor = self.tensors.to(device)
33 | mask = self.mask
34 | if mask is not None:
35 | assert mask is not None
36 | cast_mask = mask.to(device)
37 | else:
38 | cast_mask = None
39 | return NestedTensor(cast_tensor, cast_mask)
40 |
41 | def decompose(self):
42 | return self.tensors, self.mask
43 |
44 | def __repr__(self):
45 | return str(self.tensors)
46 |
47 |
48 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
49 | # TODO make this more general
50 | if tensor_list[0].ndim == 3:
51 | if torchvision._is_tracing():
52 | # nested_tensor_from_tensor_list() does not export well to ONNX
53 | # call _onnx_nested_tensor_from_tensor_list() instead
54 | return _onnx_nested_tensor_from_tensor_list(tensor_list)
55 |
56 | # TODO make it support different-sized images
57 | max_size = _max_by_axis([list(img.shape) for img in tensor_list])
58 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list]))
59 | batch_shape = [len(tensor_list)] + max_size
60 | b, c, h, w = batch_shape
61 | dtype = tensor_list[0].dtype
62 | device = tensor_list[0].device
63 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
64 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
65 | for img, pad_img, m in zip(tensor_list, tensor, mask):
66 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
67 | m[: img.shape[1], : img.shape[2]] = False
68 | else:
69 | raise ValueError("not supported")
70 | return NestedTensor(tensor, mask)
71 |
72 |
73 | # _onnx_nested_tensor_from_tensor_list() is an implementation of
74 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing.
75 | @torch.jit.unused
76 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:
77 | max_size = []
78 | for i in range(tensor_list[0].dim()):
79 | max_size_i = torch.max(
80 | torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)
81 | ).to(torch.int64)
82 | max_size.append(max_size_i)
83 | max_size = tuple(max_size)
84 |
85 | # work around for
86 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
87 | # m[: img.shape[1], :img.shape[2]] = False
88 | # which is not yet supported in onnx
89 | padded_imgs = []
90 | padded_masks = []
91 | for img in tensor_list:
92 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
93 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0]))
94 | padded_imgs.append(padded_img)
95 |
96 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device)
97 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1)
98 | padded_masks.append(padded_mask.to(torch.bool))
99 |
100 | tensor = torch.stack(padded_imgs)
101 | mask = torch.stack(padded_masks)
102 |
103 | return NestedTensor(tensor, mask=mask)
104 |
105 |
106 | def is_dist_avail_and_initialized():
107 | if not dist.is_available():
108 | return False
109 | if not dist.is_initialized():
110 | return False
111 | return True
112 |
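
The padding and mask convention of `nested_tensor_from_tensor_list` is easiest to see on a toy batch; a minimal sketch with arbitrary shapes:

```
import torch
from hgformer.utils.misc import nested_tensor_from_tensor_list

imgs = [torch.rand(3, 4, 5), torch.rand(3, 6, 3)]   # two (C, H, W) images of different sizes
nt = nested_tensor_from_tensor_list(imgs)
tensors, mask = nt.decompose()
print(tensors.shape)   # torch.Size([2, 3, 6, 5]) -- padded to the per-axis maximum
print(mask.shape)      # torch.Size([2, 6, 5])    -- True marks padded pixels
```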
--------------------------------------------------------------------------------
/hgformer/modeling/pixel_decoder/ops/test.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 |
12 | from __future__ import absolute_import
13 | from __future__ import print_function
14 | from __future__ import division
15 |
16 | import time
17 | import torch
18 | import torch.nn as nn
19 | from torch.autograd import gradcheck
20 |
21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
22 |
23 |
24 | N, M, D = 1, 2, 2
25 | Lq, L, P = 2, 2, 2
26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()
27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))
28 | S = sum([(H*W).item() for H, W in shapes])
29 |
30 |
31 | torch.manual_seed(3)
32 |
33 |
34 | @torch.no_grad()
35 | def check_forward_equal_with_pytorch_double():
36 | value = torch.rand(N, S, M, D).cuda() * 0.01
37 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
38 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
39 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
40 | im2col_step = 2
41 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu()
42 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu()
43 | fwdok = torch.allclose(output_cuda, output_pytorch)
44 | max_abs_err = (output_cuda - output_pytorch).abs().max()
45 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
46 |
47 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
48 |
49 |
50 | @torch.no_grad()
51 | def check_forward_equal_with_pytorch_float():
52 | value = torch.rand(N, S, M, D).cuda() * 0.01
53 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
54 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
55 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
56 | im2col_step = 2
57 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu()
58 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu()
59 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
60 | max_abs_err = (output_cuda - output_pytorch).abs().max()
61 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
62 |
63 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
64 |
65 |
66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True):
67 |
68 | value = torch.rand(N, S, M, channels).cuda() * 0.01
69 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
70 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
71 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
72 | im2col_step = 2
73 | func = MSDeformAttnFunction.apply
74 |
75 | value.requires_grad = grad_value
76 | sampling_locations.requires_grad = grad_sampling_loc
77 | attention_weights.requires_grad = grad_attn_weight
78 |
79 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step))
80 |
81 | print(f'* {gradok} check_gradient_numerical(D={channels})')
82 |
83 |
84 | if __name__ == '__main__':
85 | check_forward_equal_with_pytorch_double()
86 | check_forward_equal_with_pytorch_float()
87 |
88 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]:
89 | check_gradient_numerical(channels, True, True, True)
90 |
91 |
92 |
93 |
--------------------------------------------------------------------------------
/tools/visualize_data.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Copyright (c) Facebook, Inc. and its affiliates.
3 | import argparse
4 | import os
5 | from itertools import chain
6 | import cv2
7 | import tqdm
8 |
9 | from detectron2.config import get_cfg
10 | from detectron2.data import DatasetCatalog, MetadataCatalog, build_detection_train_loader
11 | from detectron2.data import detection_utils as utils
12 | from detectron2.data.build import filter_images_with_few_keypoints
13 | from detectron2.utils.logger import setup_logger
14 | from detectron2.utils.visualizer import Visualizer
15 | from detectron2.projects.deeplab import add_deeplab_config, build_lr_scheduler
16 |
17 | # MaskFormer
18 | from hgformer import (
19 | COCOInstanceNewBaselineDatasetMapper,
20 | COCOPanopticNewBaselineDatasetMapper,
21 | InstanceSegEvaluator,
22 | MaskFormerInstanceDatasetMapper,
23 | MaskFormerPanopticDatasetMapper,
24 | MaskFormerSemanticDatasetMapper,
25 | SemanticSegmentorWithTTA,
26 | add_maskformer2_config,
27 | )
28 |
29 | def setup(args):
30 | cfg = get_cfg()
31 | add_deeplab_config(cfg)
32 | add_maskformer2_config(cfg)
33 | if args.config_file:
34 | cfg.merge_from_file(args.config_file)
35 | cfg.merge_from_list(args.opts)
36 | cfg.DATALOADER.NUM_WORKERS = 0
37 | cfg.freeze()
38 | return cfg
39 |
40 |
41 | def parse_args(in_args=None):
42 | parser = argparse.ArgumentParser(description="Visualize ground-truth data")
43 | parser.add_argument(
44 | "--source",
45 | choices=["annotation", "dataloader"],
46 | required=True,
47 | help="visualize the annotations or the data loader (with pre-processing)",
48 | )
49 | parser.add_argument("--config-file", metavar="FILE", help="path to config file")
50 | parser.add_argument("--output-dir", default="./", help="path to output directory")
51 | parser.add_argument("--show", action="store_true", help="show output in a window")
52 | parser.add_argument(
53 | "opts",
54 | help="Modify config options using the command-line",
55 | default=None,
56 | nargs=argparse.REMAINDER,
57 | )
58 | return parser.parse_args(in_args)
59 |
60 |
61 | if __name__ == "__main__":
62 | args = parse_args()
63 | logger = setup_logger()
64 | logger.info("Arguments: " + str(args))
65 | cfg = setup(args)
66 |
67 | dirname = args.output_dir
68 | os.makedirs(dirname, exist_ok=True)
69 | metadata = MetadataCatalog.get(cfg.DATASETS.TRAIN[0])
70 |
71 | def output(vis, fname):
72 | if args.show:
73 | print(fname)
74 | cv2.imshow("window", vis.get_image()[:, :, ::-1])
75 | cv2.waitKey()
76 | else:
77 | filepath = os.path.join(dirname, fname)
78 | print("Saving to {} ...".format(filepath))
79 | vis.save(filepath)
80 |
81 | scale = 1.0
82 | if args.source == "dataloader":
83 | mapper = MaskFormerSemanticDatasetMapper(cfg, True)
84 | train_data_loader = build_detection_train_loader(cfg, mapper=mapper)
85 | for batch in train_data_loader:
86 | for per_image in batch:
87 | # Pytorch tensor is in (C, H, W) format
88 | img = per_image["image"].permute(1, 2, 0).cpu().detach().numpy()
89 | img = utils.convert_image_to_rgb(img, cfg.INPUT.FORMAT)
90 |
91 | visualizer = Visualizer(img, metadata=metadata, scale=scale)
92 | # import ipdb; ipdb.set_trace()
93 | target_fields = per_image["instances"].get_fields()
94 | # import ipdb; ipdb.set_trace()
95 | labels = [metadata.stuff_classes[i] for i in target_fields["gt_classes"]]
96 |
97 |
98 |
99 | vis = visualizer.output
100 | # output(vis, str(per_image["image_id"]) + ".jpg")
101 | output(vis, os.path.basename(per_image['file_name']))
102 |
103 |
104 |
105 | # vis = visualizer.overlay_instances(
106 | # labels=labels,
107 | # # boxes=target_fields.get("gt_boxes", None),
108 | # masks=target_fields.get("gt_masks", None),
109 | # # keypoints=target_fields.get("gt_keypoints", None),
110 | # )
111 | # # output(vis, str(per_image["image_id"]) + ".jpg")
112 | # output(vis, os.path.basename(per_image['file_name']))
113 | else:
114 | dicts = list(chain.from_iterable([DatasetCatalog.get(k) for k in cfg.DATASETS.TRAIN]))
115 | if cfg.MODEL.KEYPOINT_ON:
116 | dicts = filter_images_with_few_keypoints(dicts, 1)
117 | for dic in tqdm.tqdm(dicts):
118 | img = utils.read_image(dic["file_name"], "RGB")
119 | visualizer = Visualizer(img, metadata=metadata, scale=scale)
120 | vis = visualizer.draw_dataset_dict(dic)
121 | output(vis, os.path.basename(dic["file_name"]))
122 |
--------------------------------------------------------------------------------
/hgformer/evaluation/instance_evaluation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import contextlib
3 | import copy
4 | import io
5 | import itertools
6 | import json
7 | import logging
8 | import numpy as np
9 | import os
10 | import pickle
11 | from collections import OrderedDict
12 | import pycocotools.mask as mask_util
13 | import torch
14 | from pycocotools.coco import COCO
15 | from pycocotools.cocoeval import COCOeval
16 | from tabulate import tabulate
17 |
18 | import detectron2.utils.comm as comm
19 | from detectron2.config import CfgNode
20 | from detectron2.data import MetadataCatalog
21 | from detectron2.data.datasets.coco import convert_to_coco_json
22 | from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco
23 | from detectron2.evaluation.fast_eval_api import COCOeval_opt
24 | from detectron2.structures import Boxes, BoxMode, pairwise_iou
25 | from detectron2.utils.file_io import PathManager
26 | from detectron2.utils.logger import create_small_table
27 |
28 |
29 | # modified from COCOEvaluator for instance segmentation
30 | class InstanceSegEvaluator(COCOEvaluator):
31 | """
32 | Evaluate AR for object proposals, AP for instance detection/segmentation, AP
33 | for keypoint detection outputs using COCO's metrics.
34 | See http://cocodataset.org/#detection-eval and
35 | http://cocodataset.org/#keypoints-eval to understand its metrics.
36 | The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means
37 | the metric cannot be computed (e.g. due to no predictions made).
38 |
39 | In addition to COCO, this evaluator is able to support any bounding box detection,
40 | instance segmentation, or keypoint detection dataset.
41 | """
42 |
43 | def _eval_predictions(self, predictions, img_ids=None):
44 | """
45 | Evaluate predictions. Fill self._results with the metrics of the tasks.
46 | """
47 | self._logger.info("Preparing results for COCO format ...")
48 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions]))
49 | tasks = self._tasks or self._tasks_from_predictions(coco_results)
50 |
51 | # unmap the category ids for COCO
52 | if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
53 | dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id
54 | # all_contiguous_ids = list(dataset_id_to_contiguous_id.values())
55 | # num_classes = len(all_contiguous_ids)
56 | # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1
57 |
58 | reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()}
59 | for result in coco_results:
60 | category_id = result["category_id"]
61 | # assert category_id < num_classes, (
62 | # f"A prediction has class={category_id}, "
63 | # f"but the dataset only has {num_classes} classes and "
64 | # f"predicted class id should be in [0, {num_classes - 1}]."
65 | # )
66 | assert category_id in reverse_id_mapping, (
67 | f"A prediction has class={category_id}, "
68 | f"but the dataset only has class ids in {dataset_id_to_contiguous_id}."
69 | )
70 | result["category_id"] = reverse_id_mapping[category_id]
71 |
72 | if self._output_dir:
73 | file_path = os.path.join(self._output_dir, "coco_instances_results.json")
74 | self._logger.info("Saving results to {}".format(file_path))
75 | with PathManager.open(file_path, "w") as f:
76 | f.write(json.dumps(coco_results))
77 | f.flush()
78 |
79 | if not self._do_evaluation:
80 | self._logger.info("Annotations are not available for evaluation.")
81 | return
82 |
83 | self._logger.info(
84 | "Evaluating predictions with {} COCO API...".format(
85 | "unofficial" if self._use_fast_impl else "official"
86 | )
87 | )
88 | for task in sorted(tasks):
89 | assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!"
90 | coco_eval = (
91 | _evaluate_predictions_on_coco(
92 | self._coco_api,
93 | coco_results,
94 | task,
95 | kpt_oks_sigmas=self._kpt_oks_sigmas,
96 | use_fast_impl=self._use_fast_impl,
97 | img_ids=img_ids,
98 | max_dets_per_image=self._max_dets_per_image,
99 | )
100 | if len(coco_results) > 0
101 | else None # cocoapi does not handle empty results very well
102 | )
103 |
104 | res = self._derive_coco_results(
105 | coco_eval, task, class_names=self._metadata.get("thing_classes")
106 | )
107 | self._results[task] = res
108 |
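
A minimal usage sketch; `cfg` and `model` are assumed to be an already-built detectron2 config and model, and "coco_2017_val" is only an example dataset name:

```
from detectron2.data import build_detection_test_loader
from detectron2.evaluation import inference_on_dataset
from hgformer import InstanceSegEvaluator

evaluator = InstanceSegEvaluator("coco_2017_val", output_dir="./output")
val_loader = build_detection_test_loader(cfg, "coco_2017_val")
results = inference_on_dataset(model, val_loader, evaluator)
```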
--------------------------------------------------------------------------------
/demo/inference.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py
3 | import argparse
4 | import glob
5 | import multiprocessing as mp
6 | import os
7 |
8 | # fmt: off
9 | import sys
10 | sys.path.insert(1, os.path.join(sys.path[0], '..'))
11 | # fmt: on
12 |
13 | import tempfile
14 | import time
15 | import warnings
16 |
17 | import cv2
18 | import numpy as np
19 | import tqdm
20 |
21 | from detectron2.config import get_cfg
22 | from detectron2.data.detection_utils import read_image
23 | from detectron2.projects.deeplab import add_deeplab_config
24 | from detectron2.utils.logger import setup_logger
25 |
26 | from hgformer import add_maskformer2_config
27 | from predictor import VisualizationDemo
28 |
29 |
30 | # constants
31 | WINDOW_NAME = "mask2former demo"
32 |
33 | def GetFileFromThisRootDir(dir,ext = None):
34 | allfiles = []
35 |     needExtFilter = (ext is not None)
36 | for root,dirs,files in os.walk(dir):
37 | for filespath in files:
38 | filepath = os.path.join(root, filespath)
39 | extension = os.path.splitext(filepath)[1][1:]
40 | if needExtFilter and extension in ext:
41 | allfiles.append(filepath)
42 | elif not needExtFilter:
43 | allfiles.append(filepath)
44 | return allfiles
45 |
46 | def setup_cfg(args):
47 | # load config from file and command-line arguments
48 | cfg = get_cfg()
49 | add_deeplab_config(cfg)
50 | add_maskformer2_config(cfg)
51 | cfg.merge_from_file(args.config_file)
52 | cfg.merge_from_list(args.opts)
53 | cfg.freeze()
54 | return cfg
55 |
56 |
57 | def get_parser():
58 | parser = argparse.ArgumentParser(description="maskformer2 demo for builtin configs")
59 | parser.add_argument(
60 | "--config-file",
61 | default="configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml",
62 | metavar="FILE",
63 | help="path to config file",
64 | )
65 | parser.add_argument("--webcam", action="store_true", help="Take inputs from webcam.")
66 | parser.add_argument("--video-input", help="Path to video file.")
67 | parser.add_argument(
68 | "--input",
69 | nargs="+",
70 | help="A list of space separated input images; "
71 | "or a single glob pattern such as 'directory/*.jpg'",
72 | )
73 | parser.add_argument(
74 | "--output",
75 | help="A file or directory to save output visualizations. "
76 | "If not given, will show output in an OpenCV window.",
77 | )
78 |
79 | parser.add_argument(
80 | "--confidence-threshold",
81 | type=float,
82 | default=0.5,
83 | help="Minimum score for instance predictions to be shown",
84 | )
85 | parser.add_argument(
86 | "--opts",
87 | help="Modify config options using the command-line 'KEY VALUE' pairs",
88 | default=[],
89 | nargs=argparse.REMAINDER,
90 | )
91 | return parser
92 |
93 |
94 | def test_opencv_video_format(codec, file_ext):
95 | with tempfile.TemporaryDirectory(prefix="video_format_test") as dir:
96 | filename = os.path.join(dir, "test_file" + file_ext)
97 | writer = cv2.VideoWriter(
98 | filename=filename,
99 | fourcc=cv2.VideoWriter_fourcc(*codec),
100 | fps=float(30),
101 | frameSize=(10, 10),
102 | isColor=True,
103 | )
104 | [writer.write(np.zeros((10, 10, 3), np.uint8)) for _ in range(30)]
105 | writer.release()
106 | if os.path.isfile(filename):
107 | return True
108 | return False
109 |
110 |
111 | if __name__ == "__main__":
112 | mp.set_start_method("spawn", force=True)
113 | args = get_parser().parse_args()
114 | setup_logger(name="fvcore")
115 | logger = setup_logger()
116 | logger.info("Arguments: " + str(args))
117 |
118 | cfg = setup_cfg(args)
119 |
120 | demo = VisualizationDemo(cfg)
121 |
122 | # import ipdb; ipdb.set_trace()
123 | filelist = GetFileFromThisRootDir(args.input[0])
124 | for path in tqdm.tqdm(filelist, disable=not args.output):
125 | # use PIL, to be consistent with evaluation
126 | img = read_image(path, format="BGR")
127 | start_time = time.time()
128 | # predictions, visualized_output = demo.run_on_image(img)
129 | predictions = demo.predictor(img)
130 |
131 | # import ipdb; ipdb.set_trace()
132 | logger.info(
133 | "{}: {} in {:.2f}s".format(
134 | path,
135 | "detected {} instances".format(len(predictions["instances"]))
136 | if "instances" in predictions
137 | else "finished",
138 | time.time() - start_time,
139 | )
140 | )
141 |
142 | basename = os.path.basename(path)
143 | if not os.path.exists(args.output):
144 | os.makedirs(args.output)
145 | output_path = os.path.join(args.output, basename)
146 |
147 | outimg = predictions['sem_seg'].detach().cpu().numpy().argmax(0).astype(np.uint8)
148 | cv2.imwrite(output_path, outimg)
149 |
--------------------------------------------------------------------------------
/datasets/README.md:
--------------------------------------------------------------------------------
1 | # Prepare Datasets for HGFormer
2 |
3 | A dataset can be used by accessing [DatasetCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.DatasetCatalog)
4 | for its data, or [MetadataCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.MetadataCatalog) for its metadata (class names, etc).
5 | This document explains how to set up the builtin datasets so they can be used by the above APIs.
6 | [Use Custom Datasets](https://detectron2.readthedocs.io/tutorials/datasets.html) gives a deeper dive on how to use `DatasetCatalog` and `MetadataCatalog`,
7 | and how to add new datasets to them.
8 |
9 | HGFormer has builtin support for a few datasets.
10 | The datasets are assumed to exist in a directory specified by the environment variable
11 | `DETECTRON2_DATASETS`.
12 | Under this directory, detectron2 will look for datasets in the structure described below, if needed.
13 | ```
14 | $DETECTRON2_DATASETS/
15 | cityscapes/
16 | cityscapes-c/
17 | mapillary/
18 | acdc/
19 | bdd/
20 | gta/
21 | synthia/
22 | ```
23 |
24 | You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`.
25 | If left unset, the default is `./datasets` relative to your current working directory.
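
Once `DETECTRON2_DATASETS` points at the directory above, registered datasets can be inspected directly from Python. A minimal sketch, using the detectron2 builtin name `cityscapes_fine_sem_seg_val` only as an example (it requires the Cityscapes files and `cityscapesscripts` to be installed):

```
import os
os.environ["DETECTRON2_DATASETS"] = "/path/to/datasets"  # set before importing detectron2.data

from detectron2.data import DatasetCatalog, MetadataCatalog

dicts = DatasetCatalog.get("cityscapes_fine_sem_seg_val")   # list of per-image dicts
meta = MetadataCatalog.get("cityscapes_fine_sem_seg_val")   # class names, colors, ...
print(len(dicts), meta.stuff_classes[:5])
```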
26 |
27 |
28 | ## Expected dataset structure for [cityscapes](https://www.cityscapes-dataset.com/downloads/):
29 | ```
30 | cityscapes/
31 | gtFine/
32 | train/
33 | aachen/
34 | color.png, instanceIds.png, labelIds.png, polygons.json,
35 | labelTrainIds.png
36 | ...
37 | val/
38 | test/
39 | leftImg8bit/
40 | train/
41 | val/
42 | test/
43 | ```
44 |
45 | Install cityscapes scripts by:
46 | ```
47 | pip install git+https://github.com/mcordts/cityscapesScripts.git
48 | ```
49 |
50 | Note: to create labelTrainIds.png, first prepare the above structure, then run the cityscapesScripts preparation tool:
51 | ```
52 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesscripts/preparation/createTrainIdLabelImgs.py
53 | ```
54 |
55 | ## Expected dataset structure for [ACDC](https://acdc.vision.ee.ethz.ch/download):
56 | ```
57 | acdc/
58 | rgb_anon/
59 | fog/
60 | test/
61 | night/
62 | test/
63 | rain/
64 | test/
65 | snow/
66 | test/
67 | all/
68 | test/
69 |
70 | ```
71 | Create an `all` folder and copy the test images of all four conditions (fog, night, rain, snow) into `all/test`, e.g. with a short script like the one below.
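
A minimal sketch, assuming the `rgb_anon` layout above, `.png` images, and the root `datasets/acdc/rgb_anon` (adjust to your setup):

```
import shutil
from pathlib import Path

root = Path("datasets/acdc/rgb_anon")          # adjust to your ACDC location
dst = root / "all" / "test"
dst.mkdir(parents=True, exist_ok=True)
for cond in ["fog", "night", "rain", "snow"]:
    for img in (root / cond / "test").rglob("*.png"):
        shutil.copy(img, dst / img.name)       # file names are assumed unique across conditions
```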
72 |
73 | ## Expected dataset structure for [Mapillary](https://www.mapillary.com/dataset/vistas):
74 | ```
75 | mapillary/
76 | training/
77 | images/
78 |     labels/
79 | validation/
80 | images/
81 | labels/
82 | testing/
83 | images/
84 | labels/
85 | labels_detectron2/
86 | training/
87 | validation/
88 | ```
89 | Run `python datasets/prepare_mapillary_sem_seg.py` to map the Mapillary labels to the Cityscapes labels.
90 |
91 |
92 | ## Expected dataset structure for [BDD](https://bdd-data.berkeley.edu/):
93 | ```
94 | bdd/
95 | images/
96 | 10k/
97 | train/
98 | val/
99 | labels/
100 | sem_seg/
101 | masks/
102 | train/
103 | val/
104 | ```
105 |
106 |
107 | ## Expected dataset structure for Cityscapes-C:
108 |
109 | ```
110 | cityscapes-c/
111 | clean/
112 | brightness/
113 | 1/
114 | 2/
115 | 3/
116 | 4/
117 | 5/
118 | ...
119 | ```
120 |
121 | The `clean` folder should contain the Cityscapes validation (val) images.
122 |
123 | The corruption-type folders (e.g. `brightness`), each containing severity levels 1-5, are generated by running `python datasets/generate_cityscapes_c.py`.
124 |
125 |
126 | ## Expected dataset structure for [GTAV](https://download.visinf.tu-darmstadt.de/data/from_games/):
127 | ```
128 | gta/
129 | images/
130 | train/
131 | valid/
132 | test/
133 | labels/
134 | train/
135 | valid/
136 | test/
137 | labels_detectron2/
138 | train/
139 | valid/
140 | test/
141 | ```
142 | Download the GTA dataset from https://download.visinf.tu-darmstadt.de/data/from_games/
143 |
144 | Then unzip the images and labels.
145 |
146 | We split the dataset following [RobustNet](https://github.com/shachoi/RobustNet)
147 | ```
148 | python datasets/split_data/gta/split_gta.py
149 | ```
150 | For the GTA dataset, a small set of label maps (60 frames) has a different resolution than the corresponding images.
151 | Therefore, we need to resize these label maps.
152 | ```
153 | python datasets/split_data/gta/resize_img.py
154 | mv datasets/GTA/labels/valid_resize/* datasets/GTA/labels/valid/
155 | rm -rf datasets/GTA/labels/valid_resize/
156 | ```
157 | Finally, we map the labels for detectron2:
158 | ```
159 | python datasets/prepare_gta_sem_seg.py
160 | ```
161 |
162 | ## Expected dataset structure for [Synthia](https://synthia-dataset.net/downloads/):
163 | ```
164 | synthia/
165 | Depth/
166 | Depth
167 | GT/
168 | COLOR/
169 | LABELS/
170 | train/
171 | val/
172 | RGB/
173 | train/
174 | val/
175 | ```
176 | We follow [RobustNet](https://github.com/shachoi/RobustNet) to split the dataset:
177 | ```
178 | python datasets/split_data/synthia/split_synthia.py
179 | ```
180 | We then map the Synthia labels to the Cityscapes label space:
181 | ```
182 | python datasets/prepare_synthia_sem_seg.py
183 | ```
184 |
185 |
--------------------------------------------------------------------------------
/hgformer/data/samplers/balanced_sampler.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import itertools
3 | import logging
4 | import math
5 | from collections import defaultdict
6 | from typing import Optional
7 | import torch
8 | from torch.utils.data.sampler import Sampler
9 | from detectron2.utils import comm
10 |
11 | class BalancedTrainingSampler(Sampler):
12 | """
13 |     Modified from detectron2's RepeatFactorTrainingSampler.
14 | Similar to TrainingSampler, but a sample may appear more times than others based
15 | on its "repeat factor".
16 | """
17 |
18 | def __init__(self, repeat_factors, *, shuffle=True, seed=None):
19 | """
20 | Args:
21 |             repeat_factors (Tensor): a float vector, the repeat factor for each index. When it's
22 | full of ones, it is equivalent to ``TrainingSampler(len(repeat_factors), ...)``.
23 | shuffle (bool): whether to shuffle the indices or not
24 | seed (int): the initial seed of the shuffle. Must be the same
25 | across all workers. If None, will use a random seed shared
26 | among workers (require synchronization among all workers).
27 | """
28 | self._shuffle = shuffle
29 | if seed is None:
30 | seed = comm.shared_random_seed()
31 | self._seed = int(seed)
32 |
33 | self._rank = comm.get_rank()
34 | self._world_size = comm.get_world_size()
35 |
36 | # Split into whole number (_int_part) and fractional (_frac_part) parts.
37 | self._int_part = torch.trunc(repeat_factors)
38 | self._frac_part = repeat_factors - self._int_part
39 |
40 | @staticmethod
41 | def repeat_factors_from_category_frequency(dataset_dicts, repeat_thresh):
42 | """
43 | Compute (fractional) per-image repeat factors based on category frequency.
44 | The repeat factor for an image is a function of the frequency of the rarest
45 | category labeled in that image. The "frequency of category c" in [0, 1] is defined
46 | as the fraction of images in the training set (without repeats) in which category c
47 | appears.
48 | See :paper:`lvis` (>= v2) Appendix B.2.
49 |
50 | Args:
51 | dataset_dicts (list[dict]): annotations in Detectron2 dataset format.
52 | repeat_thresh (float): frequency threshold below which data is repeated.
53 | If the frequency is half of `repeat_thresh`, the image will be
54 | repeated twice.
55 |
56 | Returns:
57 | torch.Tensor:
58 | the i-th element is the repeat factor for the dataset image at index i.
59 | """
60 | # 1. For each category c, compute the fraction of images that contain it: f(c)
61 | category_freq = defaultdict(int)
62 |
63 | for dataset_dict in dataset_dicts: # For each image (without repeats)
64 | cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]}
65 | for cat_id in cat_ids:
66 | category_freq[cat_id] += 1
67 | num_images = len(dataset_dicts)
68 | for k, v in category_freq.items():
69 | category_freq[k] = v / num_images
70 |
71 | # 2. For each category c, compute the category-level repeat factor:
72 | # r(c) = max(1, sqrt(t / f(c)))
73 | category_rep = {
74 | cat_id: max(1.0, math.sqrt(repeat_thresh / cat_freq))
75 | for cat_id, cat_freq in category_freq.items()
76 | }
77 |
78 | # 3. For each image I, compute the image-level repeat factor:
79 | # r(I) = max_{c in I} r(c)
80 | rep_factors = []
81 | for dataset_dict in dataset_dicts:
82 | cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]}
83 | rep_factor = max({category_rep[cat_id] for cat_id in cat_ids}, default=1.0)
84 | rep_factors.append(rep_factor)
85 |
86 | return torch.tensor(rep_factors, dtype=torch.float32)
87 |
88 | def _get_epoch_indices(self, generator):
89 | """
90 | Create a list of dataset indices (with repeats) to use for one epoch.
91 |
92 | Args:
93 | generator (torch.Generator): pseudo random number generator used for
94 | stochastic rounding.
95 |
96 | Returns:
97 | torch.Tensor: list of dataset indices to use in one epoch. Each index
98 | is repeated based on its calculated repeat factor.
99 | """
100 | # Since repeat factors are fractional, we use stochastic rounding so
101 | # that the target repeat factor is achieved in expectation over the
102 | # course of training
103 | rands = torch.rand(len(self._frac_part), generator=generator)
104 | rep_factors = self._int_part + (rands < self._frac_part).float()
105 | # Construct a list of indices in which we repeat images as specified
106 | indices = []
107 | for dataset_index, rep_factor in enumerate(rep_factors):
108 | indices.extend([dataset_index] * int(rep_factor.item()))
109 | return torch.tensor(indices, dtype=torch.int64)
110 |
111 | def __iter__(self):
112 | start = self._rank
113 | yield from itertools.islice(self._infinite_indices(), start, None, self._world_size)
114 |
115 | def _infinite_indices(self):
116 | g = torch.Generator()
117 | g.manual_seed(self._seed)
118 | while True:
119 | # Sample indices with repeats determined by stochastic rounding; each
120 | # "epoch" may have a slightly different size due to the rounding.
121 | indices = self._get_epoch_indices(g)
122 | if self._shuffle:
123 | randperm = torch.randperm(len(indices), generator=g)
124 | yield from indices[randperm].tolist()
125 | else:
126 | yield from indices.tolist()
127 |
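
A minimal usage sketch of the repeat-factor computation (r(c) = max(1, sqrt(t / f(c))) as documented above); the dataset name is hypothetical, the dicts must carry an "annotations" field, and the threshold 0.5 is only an example:

```
from detectron2.data import DatasetCatalog
from hgformer.data.samplers.balanced_sampler import BalancedTrainingSampler

dataset_dicts = DatasetCatalog.get("my_instance_dataset")   # hypothetical registered dataset
repeat_factors = BalancedTrainingSampler.repeat_factors_from_category_frequency(
    dataset_dicts, repeat_thresh=0.5
)
sampler = BalancedTrainingSampler(repeat_factors, shuffle=True)
```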
--------------------------------------------------------------------------------
/hgformer/modeling/meta_arch/mask_former_head.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import logging
3 | from copy import deepcopy
4 | from typing import Callable, Dict, List, Optional, Tuple, Union
5 |
6 | import fvcore.nn.weight_init as weight_init
7 | from torch import nn
8 | from torch.nn import functional as F
9 |
10 | from detectron2.config import configurable
11 | from detectron2.layers import Conv2d, ShapeSpec, get_norm
12 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
13 |
14 | from ..transformer_decoder.maskformer_transformer_decoder import build_transformer_decoder
15 | from ..pixel_decoder.fpn import build_pixel_decoder
16 |
17 |
18 | @SEM_SEG_HEADS_REGISTRY.register()
19 | class MaskFormerHead(nn.Module):
20 |
21 | _version = 2
22 |
23 | # def _load_from_state_dict(
24 | # self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
25 | # ):
26 | # version = local_metadata.get("version", None)
27 | # if version is None or version < 2:
28 | # # Do not warn if train from scratch
29 | # scratch = True
30 | # logger = logging.getLogger(__name__)
31 | # for k in list(state_dict.keys()):
32 | # newk = k
33 | # if "sem_seg_head" in k and not k.startswith(prefix + "predictor"):
34 | # newk = k.replace(prefix, prefix + "pixel_decoder.")
35 | # # logger.debug(f"{k} ==> {newk}")
36 | # if newk != k:
37 | # state_dict[newk] = state_dict[k]
38 | # del state_dict[k]
39 | # scratch = False
40 | #
41 | # if not scratch:
42 | # logger.warning(
43 | # f"Weight format of {self.__class__.__name__} have changed! "
44 | # "Please upgrade your models. Applying automatic conversion now ..."
45 | # )
46 |
47 | @configurable
48 | def __init__(
49 | self,
50 | input_shape: Dict[str, ShapeSpec],
51 | *,
52 | num_classes: int,
53 | pixel_decoder: nn.Module,
54 | loss_weight: float = 1.0,
55 | ignore_value: int = -1,
56 | # extra parameters
57 | transformer_predictor: nn.Module,
58 | transformer_in_feature: str,
59 | ):
60 | """
61 | NOTE: this interface is experimental.
62 | Args:
63 | input_shape: shapes (channels and stride) of the input features
64 | num_classes: number of classes to predict
65 | pixel_decoder: the pixel decoder module
66 | loss_weight: loss weight
67 | ignore_value: category id to be ignored during training.
68 |             transformer_predictor: the transformer decoder that makes the predictions
69 | transformer_in_feature: input feature name to the transformer_predictor
70 | """
71 | super().__init__()
72 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
73 | self.in_features = [k for k, v in input_shape]
74 | feature_strides = [v.stride for k, v in input_shape]
75 | feature_channels = [v.channels for k, v in input_shape]
76 |
77 | self.ignore_value = ignore_value
78 | self.common_stride = 4
79 | self.loss_weight = loss_weight
80 |
81 | self.pixel_decoder = pixel_decoder
82 | self.predictor = transformer_predictor
83 | self.transformer_in_feature = transformer_in_feature
84 |
85 | self.num_classes = num_classes
86 |
87 | @classmethod
88 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
89 | # figure out in_channels to transformer predictor
90 | if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder":
91 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
92 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding":
93 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
94 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder": # for maskformer2
95 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
96 | else:
97 | transformer_predictor_in_channels = input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels
98 |
99 | return {
100 | "input_shape": {
101 | k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
102 | },
103 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
104 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
105 | "pixel_decoder": build_pixel_decoder(cfg, input_shape),
106 | "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT,
107 | "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE,
108 | "transformer_predictor": build_transformer_decoder(
109 | cfg,
110 | transformer_predictor_in_channels,
111 | mask_classification=True,
112 | ),
113 | }
114 |
115 | def forward(self, features, mask=None):
116 | return self.layers(features, mask)
117 |
118 | def layers(self, features, mask=None):
119 | mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(features)
120 | if self.transformer_in_feature == "multi_scale_pixel_decoder":
121 | predictions = self.predictor(multi_scale_features, mask_features, mask)
122 | else:
123 | if self.transformer_in_feature == "transformer_encoder":
124 | assert (
125 | transformer_encoder_features is not None
126 | ), "Please use the TransformerEncoderPixelDecoder."
127 | predictions = self.predictor(transformer_encoder_features, mask_features, mask)
128 | elif self.transformer_in_feature == "pixel_embedding":
129 | predictions = self.predictor(mask_features, mask_features, mask)
130 | else:
131 | predictions = self.predictor(features[self.transformer_in_feature], mask_features, mask)
132 | return predictions
133 |
--------------------------------------------------------------------------------
/tools/analyze_model.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (c) Facebook, Inc. and its affiliates.
3 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detectron2/blob/main/tools/analyze_model.py
4 |
5 | import logging
6 | import numpy as np
7 | from collections import Counter
8 | import tqdm
9 | from fvcore.nn import flop_count_table # can also try flop_count_str
10 |
11 | from detectron2.checkpoint import DetectionCheckpointer
12 | from detectron2.config import CfgNode, LazyConfig, get_cfg, instantiate
13 | from detectron2.data import build_detection_test_loader
14 | from detectron2.engine import default_argument_parser
15 | from detectron2.modeling import build_model
16 | from detectron2.projects.deeplab import add_deeplab_config
17 | from detectron2.utils.analysis import (
18 | FlopCountAnalysis,
19 | activation_count_operators,
20 | parameter_count_table,
21 | )
22 | from detectron2.utils.logger import setup_logger
23 |
24 | # fmt: off
25 | import os
26 | import sys
27 | sys.path.insert(1, os.path.join(sys.path[0], '..'))
28 | # fmt: on
29 |
30 | from hgformer import add_maskformer2_config
31 |
32 | logger = logging.getLogger("detectron2")
33 |
34 |
35 | def setup(args):
36 | if args.config_file.endswith(".yaml"):
37 | cfg = get_cfg()
38 | add_deeplab_config(cfg)
39 | add_maskformer2_config(cfg)
40 | cfg.merge_from_file(args.config_file)
41 | cfg.DATALOADER.NUM_WORKERS = 0
42 | cfg.merge_from_list(args.opts)
43 | cfg.freeze()
44 | else:
45 | cfg = LazyConfig.load(args.config_file)
46 | cfg = LazyConfig.apply_overrides(cfg, args.opts)
47 | setup_logger(name="fvcore")
48 | setup_logger()
49 | return cfg
50 |
51 |
52 | def do_flop(cfg):
53 | if isinstance(cfg, CfgNode):
54 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0])
55 | model = build_model(cfg)
56 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
57 | else:
58 | data_loader = instantiate(cfg.dataloader.test)
59 | model = instantiate(cfg.model)
60 | model.to(cfg.train.device)
61 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint)
62 | model.eval()
63 |
64 | counts = Counter()
65 | total_flops = []
66 | for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa
67 | if args.use_fixed_input_size and isinstance(cfg, CfgNode):
68 | import torch
69 | crop_size = cfg.INPUT.CROP.SIZE[0]
70 | data[0]["image"] = torch.zeros((3, crop_size, crop_size))
71 | flops = FlopCountAnalysis(model, data)
72 | if idx > 0:
73 | flops.unsupported_ops_warnings(False).uncalled_modules_warnings(False)
74 | counts += flops.by_operator()
75 | total_flops.append(flops.total())
76 |
77 | logger.info("Flops table computed from only one input sample:\n" + flop_count_table(flops))
78 | logger.info(
79 | "Average GFlops for each type of operators:\n"
80 | + str([(k, v / (idx + 1) / 1e9) for k, v in counts.items()])
81 | )
82 | logger.info(
83 | "Total GFlops: {:.1f}±{:.1f}".format(np.mean(total_flops) / 1e9, np.std(total_flops) / 1e9)
84 | )
85 |
86 |
87 | def do_activation(cfg):
88 | if isinstance(cfg, CfgNode):
89 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0])
90 | model = build_model(cfg)
91 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
92 | else:
93 | data_loader = instantiate(cfg.dataloader.test)
94 | model = instantiate(cfg.model)
95 | model.to(cfg.train.device)
96 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint)
97 | model.eval()
98 |
99 | counts = Counter()
100 | total_activations = []
101 | for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa
102 | count = activation_count_operators(model, data)
103 | counts += count
104 | total_activations.append(sum(count.values()))
105 | logger.info(
106 | "(Million) Activations for Each Type of Operators:\n"
107 |         + str([(k, v / (idx + 1)) for k, v in counts.items()])
108 | )
109 | logger.info(
110 | "Total (Million) Activations: {}±{}".format(
111 | np.mean(total_activations), np.std(total_activations)
112 | )
113 | )
114 |
115 |
116 | def do_parameter(cfg):
117 | if isinstance(cfg, CfgNode):
118 | model = build_model(cfg)
119 | else:
120 | model = instantiate(cfg.model)
121 | logger.info("Parameter Count:\n" + parameter_count_table(model, max_depth=5))
122 |
123 |
124 | def do_structure(cfg):
125 | if isinstance(cfg, CfgNode):
126 | model = build_model(cfg)
127 | else:
128 | model = instantiate(cfg.model)
129 | logger.info("Model Structure:\n" + str(model))
130 |
131 |
132 | if __name__ == "__main__":
133 | parser = default_argument_parser(
134 | epilog="""
135 | Examples:
136 | To show parameters of a model:
137 | $ ./analyze_model.py --tasks parameter \\
138 | --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml
139 | Flops and activations are data-dependent, therefore inputs and model weights
140 | are needed to count them:
141 | $ ./analyze_model.py --num-inputs 100 --tasks flop \\
142 | --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \\
143 | MODEL.WEIGHTS /path/to/model.pkl
144 | """
145 | )
146 | parser.add_argument(
147 | "--tasks",
148 | choices=["flop", "activation", "parameter", "structure"],
149 | required=True,
150 | nargs="+",
151 | )
152 | parser.add_argument(
153 | "-n",
154 | "--num-inputs",
155 | default=100,
156 | type=int,
157 | help="number of inputs used to compute statistics for flops/activations, "
158 | "both are data dependent.",
159 | )
160 | parser.add_argument(
161 | "--use-fixed-input-size",
162 | action="store_true",
163 | help="use fixed input size when calculating flops",
164 | )
165 | args = parser.parse_args()
166 | assert not args.eval_only
167 | assert args.num_gpus == 1
168 |
169 | cfg = setup(args)
170 |
171 | for task in args.tasks:
172 | {
173 | "flop": do_flop,
174 | "activation": do_activation,
175 | "parameter": do_parameter,
176 | "structure": do_structure,
177 | }[task](cfg)
178 |
--------------------------------------------------------------------------------
/hgformer/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import copy
3 | import logging
4 |
5 | import numpy as np
6 | import torch
7 | from torch.nn import functional as F
8 |
9 | from detectron2.config import configurable
10 | from detectron2.data import detection_utils as utils
11 | from detectron2.data import transforms as T
12 | from detectron2.structures import BitMasks, Instances
13 |
14 | from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper
15 |
16 | __all__ = ["MaskFormerPanopticDatasetMapper"]
17 |
18 |
19 | class MaskFormerPanopticDatasetMapper(MaskFormerSemanticDatasetMapper):
20 | """
21 | A callable which takes a dataset dict in Detectron2 Dataset format,
22 |     and maps it into a format used by MaskFormer for panoptic segmentation.
23 |
24 | The callable currently does the following:
25 |
26 |     1. Reads the image from "file_name"
27 |     2. Applies geometric transforms to the image and annotation
28 |     3. Finds and applies suitable cropping to the image and annotation
29 |     4. Prepares the image and annotation as Tensors
30 | """
31 |
32 | @configurable
33 | def __init__(
34 | self,
35 | is_train=True,
36 | *,
37 | augmentations,
38 | image_format,
39 | ignore_label,
40 | size_divisibility,
41 | ):
42 | """
43 | NOTE: this interface is experimental.
44 | Args:
45 | is_train: for training or inference
46 | augmentations: a list of augmentations or deterministic transforms to apply
47 | image_format: an image format supported by :func:`detection_utils.read_image`.
48 |             ignore_label: the label that is ignored during evaluation
49 | size_divisibility: pad image size to be divisible by this value
50 | """
51 | super().__init__(
52 | is_train,
53 | augmentations=augmentations,
54 | image_format=image_format,
55 | ignore_label=ignore_label,
56 | size_divisibility=size_divisibility,
57 | )
58 |
59 | def __call__(self, dataset_dict):
60 | """
61 | Args:
62 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
63 |
64 | Returns:
65 | dict: a format that builtin models in detectron2 accept
66 | """
67 | assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!"
68 |
69 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
70 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
71 | utils.check_image_size(dataset_dict, image)
72 |
73 | # semantic segmentation
74 | if "sem_seg_file_name" in dataset_dict:
75 | # PyTorch transformation not implemented for uint16, so converting it to double first
76 | sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double")
77 | else:
78 | sem_seg_gt = None
79 |
80 | # panoptic segmentation
81 | if "pan_seg_file_name" in dataset_dict:
82 | pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB")
83 | segments_info = dataset_dict["segments_info"]
84 | else:
85 | pan_seg_gt = None
86 | segments_info = None
87 |
88 | if pan_seg_gt is None:
89 | raise ValueError(
90 | "Cannot find 'pan_seg_file_name' for panoptic segmentation dataset {}.".format(
91 | dataset_dict["file_name"]
92 | )
93 | )
94 |
95 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
96 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
97 | image = aug_input.image
98 | if sem_seg_gt is not None:
99 | sem_seg_gt = aug_input.sem_seg
100 |
101 | # apply the same transformation to panoptic segmentation
102 | pan_seg_gt = transforms.apply_segmentation(pan_seg_gt)
103 |
104 | from panopticapi.utils import rgb2id
105 |
106 | pan_seg_gt = rgb2id(pan_seg_gt)
107 |
108 | # Pad image and segmentation label here!
109 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
110 | if sem_seg_gt is not None:
111 | sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long"))
112 | pan_seg_gt = torch.as_tensor(pan_seg_gt.astype("long"))
113 |
114 | if self.size_divisibility > 0:
115 | image_size = (image.shape[-2], image.shape[-1])
116 | padding_size = [
117 | 0,
118 | self.size_divisibility - image_size[1],
119 | 0,
120 | self.size_divisibility - image_size[0],
121 | ]
122 | image = F.pad(image, padding_size, value=128).contiguous()
123 | if sem_seg_gt is not None:
124 | sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous()
125 | pan_seg_gt = F.pad(
126 | pan_seg_gt, padding_size, value=0
127 | ).contiguous() # 0 is the VOID panoptic label
128 |
129 | image_shape = (image.shape[-2], image.shape[-1]) # h, w
130 |
131 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
132 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
133 | # Therefore it's important to use torch.Tensor.
134 | dataset_dict["image"] = image
135 | if sem_seg_gt is not None:
136 | dataset_dict["sem_seg"] = sem_seg_gt.long()
137 |
138 | if "annotations" in dataset_dict:
139 |             raise ValueError("Panoptic segmentation dataset should not have 'annotations'.")
140 |
141 | # Prepare per-category binary masks
142 | pan_seg_gt = pan_seg_gt.numpy()
143 | instances = Instances(image_shape)
144 | classes = []
145 | masks = []
146 | for segment_info in segments_info:
147 | class_id = segment_info["category_id"]
148 | if not segment_info["iscrowd"]:
149 | classes.append(class_id)
150 | masks.append(pan_seg_gt == segment_info["id"])
151 |
152 | classes = np.array(classes)
153 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
154 | if len(masks) == 0:
155 | # Some image does not have annotation (all ignored)
156 | instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
157 | else:
158 | masks = BitMasks(
159 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
160 | )
161 | instances.gt_masks = masks.tensor
162 |
163 | dataset_dict["instances"] = instances
164 |
165 | return dataset_dict
166 |
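
A minimal usage sketch mirroring `tools/visualize_data.py`; `cfg` is assumed to be a fully populated detectron2 config for a panoptic dataset:

```
from detectron2.data import build_detection_train_loader
from hgformer import MaskFormerPanopticDatasetMapper

mapper = MaskFormerPanopticDatasetMapper(cfg, True)              # built via from_config
train_loader = build_detection_train_loader(cfg, mapper=mapper)
```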
--------------------------------------------------------------------------------
/hgformer/modeling/pixel_decoder/ops/modules/ms_deform_attn.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 |
12 | from __future__ import absolute_import
13 | from __future__ import print_function
14 | from __future__ import division
15 |
16 | import warnings
17 | import math
18 |
19 | import torch
20 | from torch import nn
21 | import torch.nn.functional as F
22 | from torch.nn.init import xavier_uniform_, constant_
23 |
24 | from ..functions import MSDeformAttnFunction
25 | from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch
26 |
27 |
28 | def _is_power_of_2(n):
29 | if (not isinstance(n, int)) or (n < 0):
30 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
31 | return (n & (n-1) == 0) and n != 0
32 |
33 |
34 | class MSDeformAttn(nn.Module):
35 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
36 | """
37 | Multi-Scale Deformable Attention Module
38 | :param d_model hidden dimension
39 | :param n_levels number of feature levels
40 | :param n_heads number of attention heads
41 | :param n_points number of sampling points per attention head per feature level
42 | """
43 | super().__init__()
44 | if d_model % n_heads != 0:
45 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
46 | _d_per_head = d_model // n_heads
47 |         # _d_per_head should be a power of 2, which is more efficient in the CUDA implementation
48 | if not _is_power_of_2(_d_per_head):
49 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
50 | "which is more efficient in our CUDA implementation.")
51 |
52 | self.im2col_step = 128
53 |
54 | self.d_model = d_model
55 | self.n_levels = n_levels
56 | self.n_heads = n_heads
57 | self.n_points = n_points
58 |
59 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
60 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
61 | self.value_proj = nn.Linear(d_model, d_model)
62 | self.output_proj = nn.Linear(d_model, d_model)
63 |
64 | self._reset_parameters()
65 |
66 | def _reset_parameters(self):
67 | constant_(self.sampling_offsets.weight.data, 0.)
68 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
69 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
70 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
71 | for i in range(self.n_points):
72 | grid_init[:, :, i, :] *= i + 1
73 | with torch.no_grad():
74 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
75 | constant_(self.attention_weights.weight.data, 0.)
76 | constant_(self.attention_weights.bias.data, 0.)
77 | xavier_uniform_(self.value_proj.weight.data)
78 | constant_(self.value_proj.bias.data, 0.)
79 | xavier_uniform_(self.output_proj.weight.data)
80 | constant_(self.output_proj.bias.data, 0.)
81 |
82 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
83 | """
84 | :param query (N, Length_{query}, C)
85 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
86 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
87 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
88 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
89 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
90 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements
91 |
92 | :return output (N, Length_{query}, C)
93 | """
94 | N, Len_q, _ = query.shape
95 | N, Len_in, _ = input_flatten.shape
96 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in
97 |
98 | value = self.value_proj(input_flatten)
99 | if input_padding_mask is not None:
100 | value = value.masked_fill(input_padding_mask[..., None], float(0))
101 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
102 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
103 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
104 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
105 | # N, Len_q, n_heads, n_levels, n_points, 2
106 | if reference_points.shape[-1] == 2:
107 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
108 | sampling_locations = reference_points[:, :, None, :, None, :] \
109 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
110 | elif reference_points.shape[-1] == 4:
111 | sampling_locations = reference_points[:, :, None, :, None, :2] \
112 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
113 | else:
114 | raise ValueError(
115 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
116 | try:
117 | output = MSDeformAttnFunction.apply(
118 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
119 | except:
120 | # CPU
121 | output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
122 | # # For FLOPs calculation only
123 | # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
124 | output = self.output_proj(output)
125 | return output
126 |
--------------------------------------------------------------------------------
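A minimal usage sketch for the module above (not part of the repo): it assumes HGFormer is importable from the repo root and that the ops extension has been built via make.sh; with CPU tensors, forward() falls back to the pure-PyTorch path in the except branch.

import torch
from hgformer.modeling.pixel_decoder.ops.modules.ms_deform_attn import MSDeformAttn

N, n_levels, n_heads, n_points, d_model = 2, 3, 8, 4, 256
spatial_shapes = torch.as_tensor([[64, 64], [32, 32], [16, 16]], dtype=torch.long)
level_start_index = torch.cat(
    (spatial_shapes.new_zeros((1,)),
     (spatial_shapes[:, 0] * spatial_shapes[:, 1]).cumsum(0)[:-1])
)
len_in = int((spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum())  # 4096 + 1024 + 256 = 5376

attn = MSDeformAttn(d_model=d_model, n_levels=n_levels, n_heads=n_heads, n_points=n_points)
query = torch.rand(N, len_in, d_model)                 # one query per flattened location
reference_points = torch.rand(N, len_in, n_levels, 2)  # normalized (x, y) in [0, 1]
input_flatten = torch.rand(N, len_in, d_model)

out = attn(query, reference_points, input_flatten, spatial_shapes, level_start_index)
print(out.shape)  # torch.Size([2, 5376, 256])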
/hgformer/data/dataset_mappers/mask_former_instance_dataset_mapper.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import copy
3 | import logging
4 |
5 | import numpy as np
6 | import pycocotools.mask as mask_util
7 | import torch
8 | from torch.nn import functional as F
9 |
10 | from detectron2.config import configurable
11 | from detectron2.data import detection_utils as utils
12 | from detectron2.data import transforms as T
13 | from detectron2.projects.point_rend import ColorAugSSDTransform
14 | from detectron2.structures import BitMasks, Instances, polygons_to_bitmask
15 |
16 | __all__ = ["MaskFormerInstanceDatasetMapper"]
17 |
18 |
19 | class MaskFormerInstanceDatasetMapper:
20 | """
21 | A callable which takes a dataset dict in Detectron2 Dataset format,
22 | and maps it into a format used by MaskFormer for instance segmentation.
23 |
24 | The callable currently does the following:
25 |
26 | 1. Reads the image from "file_name"
27 | 2. Applies geometric transforms to the image and annotation
28 | 3. Finds and applies suitable cropping to the image and annotation
29 | 4. Prepares the image and annotation as Tensors
30 | """
31 |
32 | @configurable
33 | def __init__(
34 | self,
35 | is_train=True,
36 | *,
37 | augmentations,
38 | image_format,
39 | size_divisibility,
40 | ):
41 | """
42 | NOTE: this interface is experimental.
43 | Args:
44 | is_train: for training or inference
45 | augmentations: a list of augmentations or deterministic transforms to apply
46 | image_format: an image format supported by :func:`detection_utils.read_image`.
47 | size_divisibility: pad image size to be divisible by this value
48 | """
49 | self.is_train = is_train
50 | self.tfm_gens = augmentations
51 | self.img_format = image_format
52 | self.size_divisibility = size_divisibility
53 |
54 | logger = logging.getLogger(__name__)
55 | mode = "training" if is_train else "inference"
56 | logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}")
57 |
58 | @classmethod
59 | def from_config(cls, cfg, is_train=True):
60 | # Build augmentation
61 | augs = [
62 | T.ResizeShortestEdge(
63 | cfg.INPUT.MIN_SIZE_TRAIN,
64 | cfg.INPUT.MAX_SIZE_TRAIN,
65 | cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING,
66 | )
67 | ]
68 | if cfg.INPUT.CROP.ENABLED:
69 | augs.append(
70 | T.RandomCrop(
71 | cfg.INPUT.CROP.TYPE,
72 | cfg.INPUT.CROP.SIZE,
73 | )
74 | )
75 | if cfg.INPUT.COLOR_AUG_SSD:
76 | augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT))
77 | augs.append(T.RandomFlip())
78 |
79 | ret = {
80 | "is_train": is_train,
81 | "augmentations": augs,
82 | "image_format": cfg.INPUT.FORMAT,
83 | "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY,
84 | }
85 | return ret
86 |
87 | def __call__(self, dataset_dict):
88 | """
89 | Args:
90 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
91 |
92 | Returns:
93 | dict: a format that builtin models in detectron2 accept
94 | """
95 | assert self.is_train, "MaskFormerInstanceDatasetMapper should only be used for training!"
96 |
97 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
98 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
99 | utils.check_image_size(dataset_dict, image)
100 |
101 | aug_input = T.AugInput(image)
102 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
103 | image = aug_input.image
104 |
105 | # transform instance masks
106 | assert "annotations" in dataset_dict
107 | for anno in dataset_dict["annotations"]:
108 | anno.pop("keypoints", None)
109 |
110 | annos = [
111 | utils.transform_instance_annotations(obj, transforms, image.shape[:2])
112 | for obj in dataset_dict.pop("annotations")
113 | if obj.get("iscrowd", 0) == 0
114 | ]
115 |
116 | if len(annos):
117 | assert "segmentation" in annos[0]
118 | segms = [obj["segmentation"] for obj in annos]
119 | masks = []
120 | for segm in segms:
121 | if isinstance(segm, list):
122 | # polygon
123 | masks.append(polygons_to_bitmask(segm, *image.shape[:2]))
124 | elif isinstance(segm, dict):
125 | # COCO RLE
126 | masks.append(mask_util.decode(segm))
127 | elif isinstance(segm, np.ndarray):
128 | assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
129 | segm.ndim
130 | )
131 | # mask array
132 | masks.append(segm)
133 | else:
134 | raise ValueError(
135 | "Cannot convert segmentation of type '{}' to BitMasks!"
136 | "Supported types are: polygons as list[list[float] or ndarray],"
137 | " COCO-style RLE as a dict, or a binary segmentation mask "
138 | " in a 2D numpy array of shape HxW.".format(type(segm))
139 | )
140 |
141 | # Pad image and segmentation label here!
142 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
143 | masks = [torch.from_numpy(np.ascontiguousarray(x)) for x in masks]
144 |
145 | classes = [int(obj["category_id"]) for obj in annos]
146 | classes = torch.tensor(classes, dtype=torch.int64)
147 |
148 | if self.size_divisibility > 0:
149 | image_size = (image.shape[-2], image.shape[-1])
150 | padding_size = [
151 | 0,
152 | self.size_divisibility - image_size[1],
153 | 0,
154 | self.size_divisibility - image_size[0],
155 | ]
156 | # pad image
157 | image = F.pad(image, padding_size, value=128).contiguous()
158 | # pad mask
159 | masks = [F.pad(x, padding_size, value=0).contiguous() for x in masks]
160 |
161 | image_shape = (image.shape[-2], image.shape[-1]) # h, w
162 |
163 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
164 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
165 | # Therefore it's important to use torch.Tensor.
166 | dataset_dict["image"] = image
167 |
168 | # Prepare per-instance binary masks
169 | instances = Instances(image_shape)
170 | instances.gt_classes = classes
171 | if len(masks) == 0:
172 | # Some images do not have any annotations (all ignored)
173 | instances.gt_masks = torch.zeros((0, image.shape[-2], image.shape[-1]))
174 | else:
175 | masks = BitMasks(torch.stack(masks))
176 | instances.gt_masks = masks.tensor
177 |
178 | dataset_dict["instances"] = instances
179 |
180 | return dataset_dict
181 |
--------------------------------------------------------------------------------
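A short sketch (not from the repo) of how a mapper like this is typically plugged into a detectron2 training dataloader; the helper function name below is hypothetical.

from detectron2.data import build_detection_train_loader
from hgformer.data.dataset_mappers.mask_former_instance_dataset_mapper import (
    MaskFormerInstanceDatasetMapper,
)

def build_instance_train_loader(cfg):
    # The @configurable __init__ allows constructing the mapper directly from cfg,
    # which routes through from_config() above.
    mapper = MaskFormerInstanceDatasetMapper(cfg, is_train=True)
    return build_detection_train_loader(cfg, mapper=mapper)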
/hgformer/modeling/transformer_decoder/maskformer_transformer_decoder.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py
3 | import fvcore.nn.weight_init as weight_init
4 | import torch
5 | from torch import nn
6 | from torch.nn import functional as F
7 |
8 | from detectron2.config import configurable
9 | from detectron2.layers import Conv2d
10 | from detectron2.utils.registry import Registry
11 |
12 | from .position_encoding import PositionEmbeddingSine
13 | from .transformer import Transformer
14 |
15 |
16 | TRANSFORMER_DECODER_REGISTRY = Registry("TRANSFORMER_MODULE")
17 | TRANSFORMER_DECODER_REGISTRY.__doc__ = """
18 | Registry for transformer module in MaskFormer.
19 | """
20 |
21 |
22 | def build_transformer_decoder(cfg, in_channels, mask_classification=True):
23 | """
24 | Build a transformer decoder from `cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME`.
25 | """
26 | name = cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME
27 | return TRANSFORMER_DECODER_REGISTRY.get(name)(cfg, in_channels, mask_classification)
28 |
29 |
30 | @TRANSFORMER_DECODER_REGISTRY.register()
31 | class StandardTransformerDecoder(nn.Module):
32 | @configurable
33 | def __init__(
34 | self,
35 | in_channels,
36 | mask_classification=True,
37 | *,
38 | num_classes: int,
39 | hidden_dim: int,
40 | num_queries: int,
41 | nheads: int,
42 | dropout: float,
43 | dim_feedforward: int,
44 | enc_layers: int,
45 | dec_layers: int,
46 | pre_norm: bool,
47 | deep_supervision: bool,
48 | mask_dim: int,
49 | enforce_input_project: bool,
50 | ):
51 | """
52 | NOTE: this interface is experimental.
53 | Args:
54 | in_channels: channels of the input features
55 | mask_classification: whether to add mask classifier or not
56 | num_classes: number of classes
57 | hidden_dim: Transformer feature dimension
58 | num_queries: number of queries
59 | nheads: number of heads
60 | dropout: dropout in Transformer
61 | dim_feedforward: feature dimension in feedforward network
62 | enc_layers: number of Transformer encoder layers
63 | dec_layers: number of Transformer decoder layers
64 | pre_norm: whether to use pre-LayerNorm or not
65 | deep_supervision: whether to add supervision to every decoder layer
66 | mask_dim: mask feature dimension
67 | enforce_input_project: add an input projection 1x1 conv even if the input
68 | channels and hidden dim are identical
69 | """
70 | super().__init__()
71 |
72 | self.mask_classification = mask_classification
73 |
74 | # positional encoding
75 | N_steps = hidden_dim // 2
76 | self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True)
77 |
78 | transformer = Transformer(
79 | d_model=hidden_dim,
80 | dropout=dropout,
81 | nhead=nheads,
82 | dim_feedforward=dim_feedforward,
83 | num_encoder_layers=enc_layers,
84 | num_decoder_layers=dec_layers,
85 | normalize_before=pre_norm,
86 | return_intermediate_dec=deep_supervision,
87 | )
88 |
89 | self.num_queries = num_queries
90 | self.transformer = transformer
91 | hidden_dim = transformer.d_model
92 |
93 | self.query_embed = nn.Embedding(num_queries, hidden_dim)
94 |
95 | if in_channels != hidden_dim or enforce_input_project:
96 | self.input_proj = Conv2d(in_channels, hidden_dim, kernel_size=1)
97 | weight_init.c2_xavier_fill(self.input_proj)
98 | else:
99 | self.input_proj = nn.Sequential()
100 | self.aux_loss = deep_supervision
101 |
102 | # output FFNs
103 | if self.mask_classification:
104 | self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
105 | self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3)
106 |
107 | @classmethod
108 | def from_config(cls, cfg, in_channels, mask_classification):
109 | ret = {}
110 | ret["in_channels"] = in_channels
111 | ret["mask_classification"] = mask_classification
112 |
113 | ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES
114 | ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM
115 | ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES
116 | # Transformer parameters:
117 | ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS
118 | ret["dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT
119 | ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD
120 | ret["enc_layers"] = cfg.MODEL.MASK_FORMER.ENC_LAYERS
121 | ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS
122 | ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM
123 | ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION
124 | ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ
125 |
126 | ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
127 |
128 | return ret
129 |
130 | def forward(self, x, mask_features, mask=None):
131 | if mask is not None:
132 | mask = F.interpolate(mask[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
133 | pos = self.pe_layer(x, mask)
134 |
135 | src = x
136 | hs, memory = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos)
137 |
138 | if self.mask_classification:
139 | outputs_class = self.class_embed(hs)
140 | out = {"pred_logits": outputs_class[-1]}
141 | else:
142 | out = {}
143 |
144 | if self.aux_loss:
145 | # [l, bs, queries, embed]
146 | mask_embed = self.mask_embed(hs)
147 | outputs_seg_masks = torch.einsum("lbqc,bchw->lbqhw", mask_embed, mask_features)
148 | out["pred_masks"] = outputs_seg_masks[-1]
149 | out["aux_outputs"] = self._set_aux_loss(
150 | outputs_class if self.mask_classification else None, outputs_seg_masks
151 | )
152 | else:
153 | # FIXME h_boxes takes the last one computed, keep this in mind
154 | # [bs, queries, embed]
155 | mask_embed = self.mask_embed(hs[-1])
156 | outputs_seg_masks = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features)
157 | out["pred_masks"] = outputs_seg_masks
158 | return out
159 |
160 | @torch.jit.unused
161 | def _set_aux_loss(self, outputs_class, outputs_seg_masks):
162 | # this is a workaround to make torchscript happy, as torchscript
163 | # doesn't support dictionary with non-homogeneous values, such
164 | # as a dict having both a Tensor and a list.
165 | if self.mask_classification:
166 | return [
167 | {"pred_logits": a, "pred_masks": b}
168 | for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1])
169 | ]
170 | else:
171 | return [{"pred_masks": b} for b in outputs_seg_masks[:-1]]
172 |
173 |
174 | class MLP(nn.Module):
175 | """Very simple multi-layer perceptron (also called FFN)"""
176 |
177 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
178 | super().__init__()
179 | self.num_layers = num_layers
180 | h = [hidden_dim] * (num_layers - 1)
181 | self.layers = nn.ModuleList(
182 | nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])
183 | )
184 |
185 | def forward(self, x):
186 | for i, layer in enumerate(self.layers):
187 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
188 | return x
189 |
--------------------------------------------------------------------------------
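To illustrate how the registry above is meant to be used, here is a hypothetical decoder registered under a new name; the class and its internals are illustrations only and are not part of the repo.

from torch import nn
from detectron2.config import configurable

from hgformer.modeling.transformer_decoder.maskformer_transformer_decoder import (
    TRANSFORMER_DECODER_REGISTRY,
)

@TRANSFORMER_DECODER_REGISTRY.register()
class MyTinyDecoder(nn.Module):
    @configurable
    def __init__(self, in_channels, mask_classification=True, *, hidden_dim: int, num_classes: int):
        super().__init__()
        # A deliberately tiny stand-in for a real decoder: project features, predict classes.
        self.proj = nn.Conv2d(in_channels, hidden_dim, kernel_size=1)
        self.cls = nn.Linear(hidden_dim, num_classes + 1)

    @classmethod
    def from_config(cls, cfg, in_channels, mask_classification):
        return {
            "in_channels": in_channels,
            "mask_classification": mask_classification,
            "hidden_dim": cfg.MODEL.MASK_FORMER.HIDDEN_DIM,
            "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
        }

# With cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME = "MyTinyDecoder",
# build_transformer_decoder(cfg, in_channels) would instantiate this class instead.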
/hgformer/data/dataset_mappers/mask_former_semantic_dataset_mapper.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import copy
3 | import logging
4 |
5 | import numpy as np
6 | import torch
7 | from torch.nn import functional as F
8 |
9 | from detectron2.config import configurable
10 | from detectron2.data import MetadataCatalog
11 | from detectron2.data import detection_utils as utils
12 | from detectron2.data import transforms as T
13 | from detectron2.projects.point_rend import ColorAugSSDTransform
14 | from detectron2.structures import BitMasks, Instances
15 |
16 | __all__ = ["MaskFormerSemanticDatasetMapper"]
17 |
18 |
19 | class MaskFormerSemanticDatasetMapper:
20 | """
21 | A callable which takes a dataset dict in Detectron2 Dataset format,
22 | and maps it into a format used by MaskFormer for semantic segmentation.
23 |
24 | The callable currently does the following:
25 |
26 | 1. Reads the image from "file_name"
27 | 2. Applies geometric transforms to the image and annotation
28 | 3. Finds and applies suitable cropping to the image and annotation
29 | 4. Prepares the image and annotation as Tensors
30 | """
31 |
32 | @configurable
33 | def __init__(
34 | self,
35 | is_train=True,
36 | *,
37 | augmentations,
38 | image_format,
39 | ignore_label,
40 | size_divisibility,
41 | ):
42 | """
43 | NOTE: this interface is experimental.
44 | Args:
45 | is_train: for training or inference
46 | augmentations: a list of augmentations or deterministic transforms to apply
47 | image_format: an image format supported by :func:`detection_utils.read_image`.
48 | ignore_label: the label that is ignored during evaluation
49 | size_divisibility: pad image size to be divisible by this value
50 | """
51 | self.is_train = is_train
52 | self.tfm_gens = augmentations
53 | self.img_format = image_format
54 | self.ignore_label = ignore_label
55 | self.size_divisibility = size_divisibility
56 |
57 | logger = logging.getLogger(__name__)
58 | mode = "training" if is_train else "inference"
59 | logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}")
60 |
61 | @classmethod
62 | def from_config(cls, cfg, is_train=True):
63 | # Build augmentation
64 | augs = [
65 | T.ResizeShortestEdge(
66 | cfg.INPUT.MIN_SIZE_TRAIN,
67 | cfg.INPUT.MAX_SIZE_TRAIN,
68 | cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING,
69 | )
70 | ]
71 | if cfg.INPUT.CROP.ENABLED:
72 | augs.append(
73 | T.RandomCrop_CategoryAreaConstraint(
74 | cfg.INPUT.CROP.TYPE,
75 | cfg.INPUT.CROP.SIZE,
76 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA,
77 | cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
78 | )
79 | )
80 | # import ipdb; ipdb.set_trace()
81 | if cfg.INPUT.COLOR_AUG_SSD:
82 | augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT))
83 | augs.append(T.RandomFlip())
84 |
85 | # Assume always applies to the training set.
86 | dataset_names = cfg.DATASETS.TRAIN
87 | meta = MetadataCatalog.get(dataset_names[0])
88 | ignore_label = meta.ignore_label
89 |
90 | ret = {
91 | "is_train": is_train,
92 | "augmentations": augs,
93 | "image_format": cfg.INPUT.FORMAT,
94 | "ignore_label": ignore_label,
95 | "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY,
96 | }
97 | return ret
98 |
99 | def __call__(self, dataset_dict):
100 | """
101 | Args:
102 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
103 |
104 | Returns:
105 | dict: a format that builtin models in detectron2 accept
106 | """
107 | assert self.is_train, "MaskFormerSemanticDatasetMapper should only be used for training!"
108 |
109 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
110 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
111 | utils.check_image_size(dataset_dict, image)
112 | # import ipdb; ipdb.set_trace()
113 | if "sem_seg_file_name" in dataset_dict:
114 | # PyTorch transformation not implemented for uint16, so converting it to double first
115 | sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double")
116 | else:
117 | sem_seg_gt = None
118 |
119 | if sem_seg_gt is None:
120 | raise ValueError(
121 | "Cannot find 'sem_seg_file_name' for semantic segmentation dataset {}.".format(
122 | dataset_dict["file_name"]
123 | )
124 | )
125 |
126 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
127 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
128 | image = aug_input.image
129 | sem_seg_gt = aug_input.sem_seg
130 |
131 | # Pad image and segmentation label here!
132 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
133 | if sem_seg_gt is not None:
134 | sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long"))
135 |
136 | if self.size_divisibility > 0:
137 | image_size = (image.shape[-2], image.shape[-1])
138 | padding_size = [
139 | 0,
140 | self.size_divisibility - image_size[1],
141 | 0,
142 | self.size_divisibility - image_size[0],
143 | ]
144 | image = F.pad(image, padding_size, value=128).contiguous()
145 | if sem_seg_gt is not None:
146 | sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous()
147 |
148 | image_shape = (image.shape[-2], image.shape[-1]) # h, w
149 |
150 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
151 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
152 | # Therefore it's important to use torch.Tensor.
153 | dataset_dict["image"] = image
154 |
155 | if sem_seg_gt is not None:
156 | dataset_dict["sem_seg"] = sem_seg_gt.long()
157 | # import ipdb; ipdb.set_trace()
158 | if "annotations" in dataset_dict:
159 | raise ValueError("Semantic segmentation dataset should not have 'annotations'.")
160 | # import ipdb; ipdb.set_trace()
161 | # Prepare per-category binary masks
162 | if sem_seg_gt is not None:
163 | sem_seg_gt = sem_seg_gt.numpy()
164 | instances = Instances(image_shape)
165 | classes = np.unique(sem_seg_gt)
166 | # remove ignored region
167 | classes = classes[classes != self.ignore_label]
168 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
169 |
170 | masks = []
171 | for class_id in classes:
172 | masks.append(sem_seg_gt == class_id)
173 |
174 | if len(masks) == 0:
175 | # Some images do not have any annotations (all ignored)
176 | instances.gt_masks = torch.zeros((0, sem_seg_gt.shape[-2], sem_seg_gt.shape[-1]))
177 | else:
178 | try:
179 | masks = BitMasks(
180 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
181 | )
182 | instances.gt_masks = masks.tensor
183 | except:
184 | import ipdb; ipdb.set_trace()
185 |
186 | dataset_dict["instances"] = instances
187 | # import ipdb; ipdb.set_trace()
188 | return dataset_dict
189 |
--------------------------------------------------------------------------------
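A toy illustration (made-up label map, not from the repo) of the per-category binary mask construction performed at the end of the mapper above:

import numpy as np
import torch

ignore_label = 255
sem_seg_gt = np.array([[0,  0, 255],
                       [13, 13, 13]], dtype=np.int64)       # a tiny 2x3 semantic label map
classes = np.unique(sem_seg_gt)
classes = classes[classes != ignore_label]                  # -> array([0, 13])
masks = torch.stack([torch.from_numpy(sem_seg_gt == c) for c in classes])
print(classes.tolist(), masks.shape)                        # [0, 13] torch.Size([2, 2, 3])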
/hgformer/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #include <vector>
17 | #include "cuda/ms_deform_im2col_cuda.cuh"
18 | 
19 | #include <ATen/ATen.h>
20 | #include <ATen/cuda/CUDAContext.h>
21 | #include <cuda.h>
22 | #include <cuda_runtime.h>
23 |
24 |
25 | at::Tensor ms_deform_attn_cuda_forward(
26 | const at::Tensor &value,
27 | const at::Tensor &spatial_shapes,
28 | const at::Tensor &level_start_index,
29 | const at::Tensor &sampling_loc,
30 | const at::Tensor &attn_weight,
31 | const int im2col_step)
32 | {
33 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
34 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
35 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
36 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
37 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
38 |
39 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
40 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
41 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
42 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
43 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
44 |
45 | const int batch = value.size(0);
46 | const int spatial_size = value.size(1);
47 | const int num_heads = value.size(2);
48 | const int channels = value.size(3);
49 |
50 | const int num_levels = spatial_shapes.size(0);
51 |
52 | const int num_query = sampling_loc.size(1);
53 | const int num_point = sampling_loc.size(4);
54 |
55 | const int im2col_step_ = std::min(batch, im2col_step);
56 |
57 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
58 |
59 | auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
60 |
61 | const int batch_n = im2col_step_;
62 | auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
63 | auto per_value_size = spatial_size * num_heads * channels;
64 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
65 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
66 | for (int n = 0; n < batch/im2col_step_; ++n)
67 | {
68 | auto columns = output_n.select(0, n);
69 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] {
70 | ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
71 | value.data<scalar_t>() + n * im2col_step_ * per_value_size,
72 | spatial_shapes.data<int64_t>(),
73 | level_start_index.data<int64_t>(),
74 | sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
75 | attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
76 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
77 | columns.data<scalar_t>());
78 |
79 | }));
80 | }
81 |
82 | output = output.view({batch, num_query, num_heads*channels});
83 |
84 | return output;
85 | }
86 |
87 |
88 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
89 | const at::Tensor &value,
90 | const at::Tensor &spatial_shapes,
91 | const at::Tensor &level_start_index,
92 | const at::Tensor &sampling_loc,
93 | const at::Tensor &attn_weight,
94 | const at::Tensor &grad_output,
95 | const int im2col_step)
96 | {
97 |
98 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
99 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
100 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
101 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
102 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
103 | AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
104 |
105 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
106 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
107 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
108 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
109 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
110 | AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
111 |
112 | const int batch = value.size(0);
113 | const int spatial_size = value.size(1);
114 | const int num_heads = value.size(2);
115 | const int channels = value.size(3);
116 |
117 | const int num_levels = spatial_shapes.size(0);
118 |
119 | const int num_query = sampling_loc.size(1);
120 | const int num_point = sampling_loc.size(4);
121 |
122 | const int im2col_step_ = std::min(batch, im2col_step);
123 |
124 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
125 |
126 | auto grad_value = at::zeros_like(value);
127 | auto grad_sampling_loc = at::zeros_like(sampling_loc);
128 | auto grad_attn_weight = at::zeros_like(attn_weight);
129 |
130 | const int batch_n = im2col_step_;
131 | auto per_value_size = spatial_size * num_heads * channels;
132 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
133 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
134 | auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
135 |
136 | for (int n = 0; n < batch/im2col_step_; ++n)
137 | {
138 | auto grad_output_g = grad_output_n.select(0, n);
139 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] {
140 | ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
141 | grad_output_g.data<scalar_t>(),
142 | value.data<scalar_t>() + n * im2col_step_ * per_value_size,
143 | spatial_shapes.data<int64_t>(),
144 | level_start_index.data<int64_t>(),
145 | sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
146 | attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
147 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
148 | grad_value.data<scalar_t>() + n * im2col_step_ * per_value_size,
149 | grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
150 | grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
151 |
152 | }));
153 | }
154 |
155 | return {
156 | grad_value, grad_sampling_loc, grad_attn_weight
157 | };
158 | }
--------------------------------------------------------------------------------
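The kernels above are only reachable through the compiled extension built by ops/setup.py (see also ops/make.sh). Below is a quick, hedged availability check; the module name is an assumption based on the usual Deformable-DETR-style build, not something stated in this file.

try:
    # "MultiScaleDeformableAttention" is the module name conventionally produced by
    # Deformable-DETR-style ops/setup.py builds; this is an assumption.
    import MultiScaleDeformableAttention  # noqa: F401
    print("MSDeformAttn CUDA extension is available")
except ImportError:
    print("extension not built; MSDeformAttn.forward will fall back to its PyTorch path")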
/hgformer/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (c) Facebook, Inc. and its affiliates.
3 | from detectron2.config import CfgNode as CN
4 |
5 |
6 | def add_maskformer2_config(cfg):
7 | """
8 | Add config for MASK_FORMER.
9 | """
10 | # NOTE: configs from original maskformer
11 | # data config
12 | # select the dataset mapper
13 | cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic"
14 | # Color augmentation
15 | cfg.INPUT.COLOR_AUG_SSD = False
16 | cfg.INPUT.COLOR_AUG_MIX = 'partial'
17 | # We retry random cropping until no single category in semantic segmentation GT occupies more
18 | # than `SINGLE_CATEGORY_MAX_AREA` part of the crop.
19 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0
20 | # Pad image and segmentation GT in dataset mapper.
21 | cfg.INPUT.SIZE_DIVISIBILITY = -1
22 |
23 | # solver config
24 | # weight decay on embedding
25 | cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0
26 | # optimizer
27 | cfg.SOLVER.OPTIMIZER = "ADAMW"
28 | cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1
29 |
30 | # mask_former model config
31 | cfg.MODEL.MASK_FORMER = CN()
32 |
33 | # loss
34 | cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION = True
35 | cfg.MODEL.MASK_FORMER.DEEP_MASK_SUPERVISION = False
36 | cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT = 0.1
37 | cfg.MODEL.MASK_FORMER.CLASS_WEIGHT = 1.0
38 | cfg.MODEL.MASK_FORMER.DICE_WEIGHT = 1.0
39 | cfg.MODEL.MASK_FORMER.MASK_WEIGHT = 20.0
40 | cfg.MODEL.MASK_FORMER.SPIX_MASK_WEIGHT = 20.0
41 | cfg.MODEL.MASK_FORMER.SPIX_COLOR_WEIGHT = 1.0
42 | cfg.MODEL.MASK_FORMER.SPIX_CLASS_WEIGHT = 1.0
43 | cfg.MODEL.MASK_FORMER.PIXEL_CLASS_WEIGHT = 2.0
44 | cfg.MODEL.MASK_FORMER.REGION_PROXY_CLS_WEIGHT = 2.0
45 | cfg.MODEL.MASK_FORMER.CONTRASTIVE_WEIGH = 2.0
46 | cfg.MODEL.MASK_FORMER.CONTRASTIVE_LOSS = False
47 | # cfg.MODEL.MASK_FORMER.EDGE_DISTANCES = [1, 2, 4, 8]
48 | cfg.MODEL.MASK_FORMER.HIGH_THRESHOLD = 0.3
49 | cfg.MODEL.MASK_FORMER.LOW_THRESHOLD = 0.05
50 | cfg.MODEL.MASK_FORMER.RETURN_ITERATION = False
51 | cfg.MODEL.MASK_FORMER.OBLIQUE_DISTANCES = [1, 2, 4, 8]
52 | # cfg.MODEL.MASK_FORMER.BYOL_WEIGH = 2.0
53 | # cfg.MODEL.MASK_FORMER.EDGE_WEIGH = 2.0
54 | # cfg.MODEL.MASK_FORMER.PSEUDO_EDGE_WEIGH = 2.0
55 | cfg.MODEL.MASK_FORMER.SPIX_PIXEL_CLS_WEIGH = 2.0
56 | # cfg.MODEL.MASK_FORMER.BYOL_LOSS = False
57 | # cfg.MODEL.MASK_FORMER.EDGE_LOSS = False
58 | cfg.MODEL.MASK_FORMER.CONTRASTIVE_TAU = 0.3
59 | cfg.MODEL.MASK_FORMER.COMPUTE_RAMA = False
60 | cfg.MODEL.MASK_FORMER.RECONSTRUCT_LOSS = False
61 | cfg.MODEL.MASK_FORMER.RECONSTRUCT_COLOR = False
62 | cfg.MODEL.MASK_FORMER.RECONSTRUCT_COORD = False
63 | cfg.MODEL.MASK_FORMER.STAGE_WEIGHTS = [1.0, 1.0]
64 | cfg.MODEL.MASK_FORMER.SPIX_MASK_STAGE2 = 1.0
65 |
66 | # transformer config
67 | cfg.MODEL.MASK_FORMER.NHEADS = 8
68 | cfg.MODEL.MASK_FORMER.DROPOUT = 0.1
69 | cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD = 2048
70 | cfg.MODEL.MASK_FORMER.ENC_LAYERS = 0
71 | cfg.MODEL.MASK_FORMER.DEC_LAYERS = 6
72 | cfg.MODEL.MASK_FORMER.SPIX_SELF_ATTEN_LAYERS = 6
73 | cfg.MODEL.MASK_FORMER.PRE_NORM = False
74 |
75 | cfg.MODEL.MASK_FORMER.HIDDEN_DIM = 256
76 | cfg.MODEL.MASK_FORMER.CONTRASTIVE_DIM = 128
77 | cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES = 100
78 |
79 | cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE = "res5"
80 | cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ = False
81 |
82 | # mask_former inference config
83 | cfg.MODEL.MASK_FORMER.TEST = CN()
84 | cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True
85 | cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = False
86 | cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = False
87 | cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0
88 | cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD = 0.0
89 | cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False
90 | # cfg.TEST.MODE = "whole" # "whole" or "slide"
91 | # cfg.TEST.STRIDE = (300, 768)
92 | # cfg.TEST.CROP_SIZE = (512, 1024)
93 | cfg.TEST.CLUSTER_SOFTMAX = False
94 | cfg.TEST.PRED_STAGE = "all"
95 |
96 | # Sometimes `backbone.size_divisibility` is set to 0 for some backbones (e.g. ResNet);
97 | # you can use this config to override it
98 | cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY = 32
99 |
100 | cfg.MODEL.MASK_FORMER.GZERO_CALIBRATE = -1.0
101 | cfg.MODEL.MASK_FORMER.ENSEMBLING = False
102 | cfg.MODEL.MASK_FORMER.ENSEMBLING_ALL_CLS = False
103 |
104 | # vis
105 | cfg.MODEL.MASK_FORMER.VIS = False
106 | cfg.MODEL.MASK_FORMER.QUERY_SHAPE = [8, 16] # h, w
107 | cfg.MODEL.MASK_FORMER.ENSEMBLING_START = 1
108 |
109 | # pixel decoder config
110 | cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256
111 | # adding transformer in pixel decoder
112 | cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0
113 | # pixel decoder
114 | cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder"
115 | # gzero calibrate
116 | cfg.MODEL.SEM_SEG_HEAD.GZERO_CALIBRATE = -1.0
117 |
118 | # swin transformer backbone
119 | cfg.MODEL.SWIN = CN()
120 | cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224
121 | cfg.MODEL.SWIN.PATCH_SIZE = 4
122 | cfg.MODEL.SWIN.EMBED_DIM = 96
123 | cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2]
124 | cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24]
125 | cfg.MODEL.SWIN.WINDOW_SIZE = 7
126 | cfg.MODEL.SWIN.MLP_RATIO = 4.0
127 | cfg.MODEL.SWIN.QKV_BIAS = True
128 | cfg.MODEL.SWIN.QK_SCALE = None
129 | cfg.MODEL.SWIN.DROP_RATE = 0.0
130 | cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0
131 | cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3
132 | cfg.MODEL.SWIN.APE = False
133 | cfg.MODEL.SWIN.PATCH_NORM = True
134 | cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
135 | cfg.MODEL.SWIN.USE_CHECKPOINT = False
136 |
137 | # pvt backbone
138 | cfg.MODEL.PVTV2 = CN()
139 | cfg.MODEL.PVTV2.PATCH_SIZE = 4
140 | cfg.MODEL.PVTV2.IN_CHANS = 3
141 | cfg.MODEL.PVTV2.EMBED_DIMS = [32, 64, 160, 256]
142 | cfg.MODEL.PVTV2.NUM_HEADS = [1, 2, 5, 8]
143 | cfg.MODEL.PVTV2.MLP_RATIO = [8, 8, 4, 4]
144 | cfg.MODEL.PVTV2.QKV_BIAS = True
145 | cfg.MODEL.PVTV2.DROP_RATE = 0.0
146 | cfg.MODEL.PVTV2.DROP_PATH_RATE = 0.
147 | cfg.MODEL.PVTV2.QK_SCALE = None
148 | cfg.MODEL.PVTV2.DEPTHS = [2, 2, 2, 2]
149 | cfg.MODEL.PVTV2.SR_RATIOS = [8, 4, 2, 1]
150 | cfg.MODEL.PVTV2.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
151 |
152 |
153 | cfg.MODEL.SEM_SEG_HEAD.MASKATTENTIONPOOL = False
154 | cfg.MODEL.SEM_SEG_HEAD.TEMPERATURE = 0.01
155 | cfg.MODEL.SEM_SEG_HEAD.GAT_NUM_LAYERS = 2
156 | cfg.MODEL.SEM_SEG_HEAD.DOWNSAMPLE_RATE = 4
157 | # cfg.MODEL.CRITERION = "spix" # default
158 |
159 | # self training config
160 | cfg.MODEL.PSEUDO_LABEL = False
161 | cfg.MODEL.PSEUDO_WEIGHT = 1.0
162 | cfg.MODEL.PSEUDO_THR = -1.
163 |
164 |
165 | cfg.MODEL.DYNAMIC_MEN_STD = False
166 | # cfg.MODEL.LAB_INPUT = False
167 |
168 | # NOTE: maskformer2 extra configs
169 | # transformer module
170 | cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME = "MultiScaleMaskedTransformerDecoder"
171 |
172 | # LSJ aug
173 | cfg.INPUT.IMAGE_SIZE = 1024
174 | cfg.INPUT.MIN_SCALE = 0.1
175 | cfg.INPUT.MAX_SCALE = 2.0
176 |
177 | # MSDeformAttn encoder configs
178 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"]
179 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4
180 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8
181 |
182 | # point loss configs
183 | # Number of points sampled during training for a mask point head.
184 | cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS = 112 * 112
185 | # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the
186 | # original paper.
187 | cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO = 3.0
188 | # Importance sampling parameter for PointRend point sampling during training. Parameter `beta` in
189 | # the original paper.
190 | cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75
191 |
192 | # params for groupformer
193 | cfg.MODEL.SEM_SEG_HEAD.NUM_GROUP_TOKENS = [256, 128, 64]
194 | cfg.MODEL.SEM_SEG_HEAD.NUM_OUTPUT_GROUPS = [256, 128, 64]
195 | cfg.MODEL.SEM_SEG_HEAD.NUM_HEADS = [8, 8, 8]
196 | cfg.MODEL.SEM_SEG_HEAD.SPIX_RES = [32, 32]
197 | cfg.MODEL.SEM_SEG_HEAD.MASK_POOL_STYLE = "attn_pool"
198 | cfg.MODEL.SEM_SEG_HEAD.TAU = 0.07
199 |
200 | cfg.MODEL.OUT_SUBMISSION_FORMAT = False
201 |
202 | cfg.MODEL.SEM_SEG_HEAD.SPIX_SELF_ATTEN = True
203 | cfg.MODEL.SEM_SEG_HEAD.SPIX_FFN = True
204 |
--------------------------------------------------------------------------------
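A rough usage sketch for add_maskformer2_config; the add_deeplab_config call is an assumption based on the Mask2Former convention, and the YAML path is one of the files listed under configs/.

from detectron2.config import get_cfg
from detectron2.projects.deeplab import add_deeplab_config  # assumed to be required by the base configs
from hgformer.config import add_maskformer2_config

cfg = get_cfg()
add_deeplab_config(cfg)
add_maskformer2_config(cfg)
cfg.merge_from_file("configs/cityscapes/hgformer_R50_bs16_20k.yaml")
cfg.freeze()
print(cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES, cfg.MODEL.SEM_SEG_HEAD.NUM_GROUP_TOKENS)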
/demo/predictor.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Copied from: https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py
3 | import atexit
4 | import bisect
5 | import multiprocessing as mp
6 | from collections import deque
7 |
8 | import cv2
9 | import torch
10 |
11 | from detectron2.data import MetadataCatalog
12 | from detectron2.engine.defaults import DefaultPredictor
13 | from detectron2.utils.video_visualizer import VideoVisualizer
14 | from detectron2.utils.visualizer import ColorMode, Visualizer
15 |
16 |
17 | class VisualizationDemo(object):
18 | def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
19 | """
20 | Args:
21 | cfg (CfgNode):
22 | instance_mode (ColorMode):
23 | parallel (bool): whether to run the model in different processes from visualization.
24 | Useful since the visualization logic can be slow.
25 | """
26 | # import ipdb; ipdb.set_trace()
27 | # self.metadata = MetadataCatalog.get(
28 | # cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
29 | # )
30 | # TODO: fix it, sorry, hard coded for cityscapes categories
31 | self.metadata = MetadataCatalog.get(
32 | cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
33 | )
34 | self.cpu_device = torch.device("cpu")
35 | self.instance_mode = instance_mode
36 |
37 | self.parallel = parallel
38 | if parallel:
39 | num_gpu = torch.cuda.device_count()
40 | self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu)
41 | else:
42 | self.predictor = DefaultPredictor(cfg)
43 |
44 | def run_on_image(self, image):
45 | """
46 | Args:
47 | image (np.ndarray): an image of shape (H, W, C) (in BGR order).
48 | This is the format used by OpenCV.
49 | Returns:
50 | predictions (dict): the output of the model.
51 | vis_output (VisImage): the visualized image output.
52 | """
53 | vis_output = None
54 | predictions = self.predictor(image)
55 | # Convert image from OpenCV BGR format to Matplotlib RGB format.
56 | image = image[:, :, ::-1]
57 | # import ipdb; ipdb.set_trace()
58 | visualizer = Visualizer(image, self.metadata, instance_mode=self.instance_mode)
59 | if "panoptic_seg" in predictions:
60 | panoptic_seg, segments_info = predictions["panoptic_seg"]
61 | vis_output = visualizer.draw_panoptic_seg_predictions(
62 | panoptic_seg.to(self.cpu_device), segments_info
63 | )
64 | else:
65 | if "sem_seg" in predictions:
66 | vis_output = visualizer.draw_sem_seg(
67 | predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
68 | )
69 | if "instances" in predictions:
70 | instances = predictions["instances"].to(self.cpu_device)
71 | vis_output = visualizer.draw_instance_predictions(predictions=instances)
72 |
73 | return predictions, vis_output
74 |
75 | def _frame_from_video(self, video):
76 | while video.isOpened():
77 | success, frame = video.read()
78 | if success:
79 | yield frame
80 | else:
81 | break
82 |
83 | def run_on_video(self, video):
84 | """
85 | Visualizes predictions on frames of the input video.
86 | Args:
87 | video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be
88 | either a webcam or a video file.
89 | Yields:
90 | ndarray: BGR visualizations of each video frame.
91 | """
92 | video_visualizer = VideoVisualizer(self.metadata, self.instance_mode)
93 |
94 | def process_predictions(frame, predictions):
95 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
96 | if "panoptic_seg" in predictions:
97 | panoptic_seg, segments_info = predictions["panoptic_seg"]
98 | vis_frame = video_visualizer.draw_panoptic_seg_predictions(
99 | frame, panoptic_seg.to(self.cpu_device), segments_info
100 | )
101 | elif "instances" in predictions:
102 | predictions = predictions["instances"].to(self.cpu_device)
103 | vis_frame = video_visualizer.draw_instance_predictions(frame, predictions)
104 | elif "sem_seg" in predictions:
105 | vis_frame = video_visualizer.draw_sem_seg(
106 | frame, predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
107 | )
108 |
109 | # Converts Matplotlib RGB format to OpenCV BGR format
110 | vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR)
111 | return vis_frame
112 |
113 | frame_gen = self._frame_from_video(video)
114 | if self.parallel:
115 | buffer_size = self.predictor.default_buffer_size
116 |
117 | frame_data = deque()
118 |
119 | for cnt, frame in enumerate(frame_gen):
120 | frame_data.append(frame)
121 | self.predictor.put(frame)
122 |
123 | if cnt >= buffer_size:
124 | frame = frame_data.popleft()
125 | predictions = self.predictor.get()
126 | yield process_predictions(frame, predictions)
127 |
128 | while len(frame_data):
129 | frame = frame_data.popleft()
130 | predictions = self.predictor.get()
131 | yield process_predictions(frame, predictions)
132 | else:
133 | for frame in frame_gen:
134 | yield process_predictions(frame, self.predictor(frame))
135 |
136 |
137 | class AsyncPredictor:
138 | """
139 | A predictor that runs the model asynchronously, possibly on more than one GPU.
140 | Because rendering the visualization takes a considerable amount of time,
141 | this helps improve throughput a little bit when rendering videos.
142 | """
143 |
144 | class _StopToken:
145 | pass
146 |
147 | class _PredictWorker(mp.Process):
148 | def __init__(self, cfg, task_queue, result_queue):
149 | self.cfg = cfg
150 | self.task_queue = task_queue
151 | self.result_queue = result_queue
152 | super().__init__()
153 |
154 | def run(self):
155 | predictor = DefaultPredictor(self.cfg)
156 |
157 | while True:
158 | task = self.task_queue.get()
159 | if isinstance(task, AsyncPredictor._StopToken):
160 | break
161 | idx, data = task
162 | result = predictor(data)
163 | self.result_queue.put((idx, result))
164 |
165 | def __init__(self, cfg, num_gpus: int = 1):
166 | """
167 | Args:
168 | cfg (CfgNode):
169 | num_gpus (int): if 0, will run on CPU
170 | """
171 | num_workers = max(num_gpus, 1)
172 | self.task_queue = mp.Queue(maxsize=num_workers * 3)
173 | self.result_queue = mp.Queue(maxsize=num_workers * 3)
174 | self.procs = []
175 | for gpuid in range(max(num_gpus, 1)):
176 | cfg = cfg.clone()
177 | cfg.defrost()
178 | cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu"
179 | self.procs.append(
180 | AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue)
181 | )
182 |
183 | self.put_idx = 0
184 | self.get_idx = 0
185 | self.result_rank = []
186 | self.result_data = []
187 |
188 | for p in self.procs:
189 | p.start()
190 | atexit.register(self.shutdown)
191 |
192 | def put(self, image):
193 | self.put_idx += 1
194 | self.task_queue.put((self.put_idx, image))
195 |
196 | def get(self):
197 | self.get_idx += 1 # the index needed for this request
198 | if len(self.result_rank) and self.result_rank[0] == self.get_idx:
199 | res = self.result_data[0]
200 | del self.result_data[0], self.result_rank[0]
201 | return res
202 |
203 | while True:
204 | # make sure the results are returned in the correct order
205 | idx, res = self.result_queue.get()
206 | if idx == self.get_idx:
207 | return res
208 | insert = bisect.bisect(self.result_rank, idx)
209 | self.result_rank.insert(insert, idx)
210 | self.result_data.insert(insert, res)
211 |
212 | def __len__(self):
213 | return self.put_idx - self.get_idx
214 |
215 | def __call__(self, image):
216 | self.put(image)
217 | return self.get()
218 |
219 | def shutdown(self):
220 | for _ in self.procs:
221 | self.task_queue.put(AsyncPredictor._StopToken())
222 |
223 | @property
224 | def default_buffer_size(self):
225 | return len(self.procs) * 5
226 |
--------------------------------------------------------------------------------
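Finally, a sketch of running VisualizationDemo on a single image, mirroring what demo/inference.py is expected to do; the checkpoint path and image filenames are placeholders, and the config setup repeats the sketch shown after hgformer/config.py.

import cv2
from detectron2.config import get_cfg
from detectron2.projects.deeplab import add_deeplab_config  # assumed, as in the config sketch above
from hgformer.config import add_maskformer2_config
from predictor import VisualizationDemo  # when run from the demo/ directory

cfg = get_cfg()
add_deeplab_config(cfg)
add_maskformer2_config(cfg)
cfg.merge_from_file("configs/cityscapes/hgformer_R50_bs16_20k.yaml")
cfg.MODEL.WEIGHTS = "checkpoints/hgformer_r50.pth"   # placeholder checkpoint path
cfg.freeze()

demo = VisualizationDemo(cfg)
image = cv2.imread("input.png")                      # BGR (H, W, C), as run_on_image expects
predictions, vis_output = demo.run_on_image(image)
vis_output.save("output.png")                        # detectron2 VisImage.save writes the result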