├── hgformer ├── evaluation │ ├── __init__.py │ └── instance_evaluation.py ├── utils │ ├── __init__.py │ └── misc.py ├── modeling │ ├── backbone │ │ └── __init__.py │ ├── meta_arch │ │ ├── __init__.py │ │ ├── group_former_head.py │ │ └── mask_former_head.py │ ├── pixel_decoder │ │ ├── __init__.py │ │ └── ops │ │ │ ├── make.sh │ │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn.py │ │ │ ├── functions │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn_func.py │ │ │ ├── src │ │ │ ├── vision.cpp │ │ │ ├── cuda │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ └── ms_deform_attn_cuda.cu │ │ │ ├── cpu │ │ │ │ ├── ms_deform_attn_cpu.h │ │ │ │ └── ms_deform_attn_cpu.cpp │ │ │ └── ms_deform_attn.h │ │ │ ├── setup.py │ │ │ └── test.py │ ├── transformer_decoder │ │ ├── __init__.py │ │ ├── position_encoding.py │ │ └── maskformer_transformer_decoder.py │ └── __init__.py ├── data │ ├── dataset_mappers │ │ ├── __init__.py │ │ ├── mask_former_panoptic_dataset_mapper.py │ │ ├── mask_former_instance_dataset_mapper.py │ │ └── mask_former_semantic_dataset_mapper.py │ ├── __init__.py │ ├── samplers │ │ ├── __init__.py │ │ ├── grouped_batch_sampler.py │ │ └── balanced_sampler.py │ └── datasets │ │ ├── __init__.py │ │ ├── register_gta.py │ │ ├── register_synthia.py │ │ ├── register_bdd.py │ │ ├── register_mapillary_19.py │ │ ├── register_city_c.py │ │ └── register_city_c_vis.py ├── __init__.py ├── test_time_augmentation.py └── config.py ├── requirements.txt ├── configs ├── cityscapes │ ├── maskformer2_swin_large_IN21k_384_bs16_20k.yaml │ ├── hgformer_swin_large_IN21K_384_bs16_20k.yaml │ ├── maskformer2_swin_tiny_bs16_20k.yaml │ ├── hgformer_swin_tiny_bs16_20k.yaml │ ├── maskformer2_R50_bs16_20k_gn.yaml │ ├── Base-Cityscapes-SemanticSegmentation.yaml │ └── hgformer_R50_bs16_20k.yaml ├── mapillary │ ├── maskformer2_swin_large_IN21k_384_bs16_20k_mapillary.yaml │ ├── hgformer_swin_tiny_bs16_20k_mapillary.yaml │ ├── maskformer2_swin_tiny_bs16_20k_mapillary.yaml │ ├── hgformer_swin_large_IN21k_384_bs16_20k_mapillary.yaml │ ├── maskformer2_R50_bs16_90k_gn_mapillary_20k.yaml │ ├── Base-mapillary19-SemanticSegmentation.yaml │ └── hgformer_R50_bs16_20k_mapillary.yaml └── city_c │ ├── hgformer_swin_large_IN21K_384_bs16_20k.yaml │ ├── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ ├── hgformer_swin_tiny_bs16_20k.yaml │ └── maskformer2_swin_tiny_bs16_20k.yaml ├── .gitignore ├── tools ├── convert-pretrained-swin-model-to-d2.py ├── convert-torchvision-to-d2.py ├── evaluate_coco_boundary_ap.py ├── README.md ├── visualize_data.py └── analyze_model.py ├── README.md ├── datasets ├── split_data │ ├── gta │ │ ├── resize_img.py │ │ └── split_gta.py │ └── synthia │ │ └── split_synthia.py ├── prepare_gta_sem_seg.py ├── generate_cityscapes_c.py ├── find_truncated_images.py ├── prepare_mapillary_sem_seg.py ├── prepare_synthia_sem_seg.py └── README.md ├── INSTALL.md ├── GETTING_STARTED.md ├── MODEL_ZOO.md └── demo ├── inference.py └── predictor.py /hgformer/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hgformer/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | -------------------------------------------------------------------------------- /hgformer/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /hgformer/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /hgformer/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /hgformer/modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /hgformer/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import datasets -------------------------------------------------------------------------------- /hgformer/data/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | from .balanced_sampler import ( 2 | BalancedTrainingSampler, 3 | ) 4 | 5 | 6 | __all__ = [ 7 | "BalancedTrainingSampler", 8 | ] 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | scipy 3 | numpy==1.23.1 4 | setuptools==58.0.4 5 | shapely 6 | timm 7 | h5py 8 | submitit 9 | scikit-image 10 | ftfy 11 | einops 12 | regex 13 | mmcv 14 | imagecorruptions -------------------------------------------------------------------------------- /hgformer/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import ( 3 | # register_acdc, 4 | register_gta, 5 | register_city_c, 6 | register_bdd, 7 | register_synthia, 8 | register_mapillary_19, 9 | register_city_c_vis, 10 | ) 11 | -------------------------------------------------------------------------------- /hgformer/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
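# Importing these decoder classes registers them (via Mask2Former's transformer-decoder registry) so that
# MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME in the configs can select one of them by class name.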
2 | from .maskformer_transformer_decoder import StandardTransformerDecoder 3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder 4 | from .groupFormer_transformer_decoder import GroupFormerDecoder 5 | from .mask2former_transformer_decoder_wo_maskatten import MultiScaleMaskedTransformerDecoderWoMaskAtten 6 | -------------------------------------------------------------------------------- /configs/cityscapes/maskformer2_swin_large_IN21k_384_bs16_20k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_20k_gn.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | SOLVER: 20 | MAX_ITER: 20000 -------------------------------------------------------------------------------- /configs/mapillary/maskformer2_swin_large_IN21k_384_bs16_20k_mapillary.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k_gn_mapillary_20k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | SOLVER: 20 | MAX_ITER: 20000 -------------------------------------------------------------------------------- /configs/mapillary/hgformer_swin_tiny_bs16_20k_mapillary.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: hgformer_R50_bs16_20k_mapillary.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | 17 | SOLVER: 18 | MAX_ITER: 20000 19 | IMS_PER_BATCH: 16 20 | 21 | TEST: 22 | CLUSTER_SOFTMAX: True 23 | PRED_STAGE: "spix_pixelexclude0125+stage3" -------------------------------------------------------------------------------- /configs/mapillary/maskformer2_swin_tiny_bs16_20k_mapillary.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k_gn_mapillary_20k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | 17 | DATALOADER: 18 | FILTER_EMPTY_ANNOTATIONS: True 19 | NUM_WORKERS: 4 20 | VERSION: 2 21 | SOLVER: 22 | MAX_ITER: 20000 23 | 24 | CUDNN_BENCHMARK: True 25 | -------------------------------------------------------------------------------- 
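The configs above all extend a `_BASE_` file and are consumed through detectron2's standard launcher flags. A minimal usage sketch, assuming the repository keeps Mask2Former's `train_net.py` entry point (the launcher script itself is not part of this listing):

```bash
# training (hypothetical invocation; the detectron2 launcher flags --num-gpus/--config-file are standard)
python train_net.py --num-gpus 8 \
  --config-file configs/cityscapes/hgformer_R50_bs16_20k.yaml

# evaluation of a trained checkpoint on the TEST datasets listed in the config
python train_net.py --eval-only \
  --config-file configs/cityscapes/hgformer_R50_bs16_20k.yaml \
  MODEL.WEIGHTS /path/to/checkpoint.pth
```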
/configs/city_c/hgformer_swin_large_IN21K_384_bs16_20k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: hgformer_swin_tiny_bs16_20k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | 18 | SOLVER: 19 | MAX_ITER: 20000 20 | # IMS_PER_BATCH: 2 21 | 22 | TEST: 23 | CLUSTER_SOFTMAX: True 24 | PRED_STAGE: "spix_all_stage_exclude012" -------------------------------------------------------------------------------- /configs/cityscapes/hgformer_swin_large_IN21K_384_bs16_20k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: hgformer_R50_bs16_20k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | 18 | SOLVER: 19 | MAX_ITER: 20000 20 | # IMS_PER_BATCH: 2 21 | 22 | TEST: 23 | CLUSTER_SOFTMAX: True 24 | PRED_STAGE: "spix_pixelexclude0125+stage3" -------------------------------------------------------------------------------- /configs/mapillary/hgformer_swin_large_IN21k_384_bs16_20k_mapillary.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: hgformer_R50_bs16_20k_mapillary.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | 18 | SOLVER: 19 | MAX_ITER: 20000 20 | # IMS_PER_BATCH: 2 21 | 22 | TEST: 23 | CLUSTER_SOFTMAX: True 24 | PRED_STAGE: "spix_pixelexclude0125+stage3" -------------------------------------------------------------------------------- /hgformer/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
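# Importing these modules registers the backbones, pixel decoders, and segmentation heads with
# detectron2's registries, so the YAML configs can refer to them by name
# (e.g. "D2SwinTransformer", "MSDeformAttnPixelDecoder", "MaskFormerHead").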
2 | from .backbone.swin import D2SwinTransformer, D2SwinTransformerFreeze 3 | from .pixel_decoder.fpn import BasePixelDecoder 4 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder 5 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoderv2 6 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoderv3 7 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecodervSingleLayer 8 | from .meta_arch.mask_former_head import MaskFormerHead 9 | from .meta_arch.group_former_head import GroupFormerHead 10 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead 11 | -------------------------------------------------------------------------------- /configs/cityscapes/maskformer2_swin_tiny_bs16_20k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_20k_gn.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | #DATASETS: 17 | # TRAIN: ("cityscapes_fine_sem_seg_train",) 18 | # TEST: ("cityscapes_fine_sem_seg_val",) 19 | DATALOADER: 20 | FILTER_EMPTY_ANNOTATIONS: True 21 | NUM_WORKERS: 4 22 | VERSION: 2 23 | SOLVER: 24 | MAX_ITER: 20000 25 | 26 | CUDNN_BENCHMARK: True 27 | -------------------------------------------------------------------------------- /configs/cityscapes/hgformer_swin_tiny_bs16_20k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: hgformer_R50_bs16_20k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | #DATASETS: 17 | # TRAIN: ("cityscapes_fine_sem_seg_train",) 18 | # TEST: ("synthia_val", "cityscapes_fine_sem_seg_val", "bdd_val", "mapillary_val", "gta_trainid_val") 19 | SOLVER: 20 | MAX_ITER: 20000 21 | IMS_PER_BATCH: 16 22 | 23 | TEST: 24 | CLUSTER_SOFTMAX: True 25 | PRED_STAGE: "spix_pixelexclude0125+stage3" -------------------------------------------------------------------------------- /hgformer/modeling/pixel_decoder/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | # Copyright (c) Facebook, Inc. and its affiliates. 
11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 12 | 13 | python setup.py build install 14 | -------------------------------------------------------------------------------- /hgformer/modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /hgformer/modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /hgformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import data # register all new datasets 3 | from . 
import modeling 4 | 5 | # config 6 | from .config import add_maskformer2_config 7 | 8 | # dataset loading 9 | 10 | from .data.dataset_mappers.mask_former_instance_dataset_mapper import ( 11 | MaskFormerInstanceDatasetMapper, 12 | ) 13 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import ( 14 | MaskFormerPanopticDatasetMapper, 15 | ) 16 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 17 | MaskFormerSemanticDatasetMapper, 18 | ) 19 | 20 | 21 | # models 22 | from .maskformer_model import MaskFormer 23 | from .test_time_augmentation import SemanticSegmentorWithTTA 24 | from .groupformer_model import GroupFormer 25 | 26 | # evaluation 27 | from .evaluation.instance_evaluation import InstanceSegEvaluator 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # output dir 2 | output 3 | instant_test_output 4 | inference_test_output 5 | 6 | 7 | *.png 8 | *.json 9 | *.diff 10 | *.jpg 11 | !/projects/DensePose/doc/images/*.jpg 12 | 13 | # compilation and distribution 14 | __pycache__ 15 | _ext 16 | *.pyc 17 | *.pyd 18 | *.so 19 | *.dll 20 | *.egg-info/ 21 | build/ 22 | dist/ 23 | wheels/ 24 | 25 | # pytorch/python/numpy formats 26 | *.pth 27 | *.pkl 28 | *.npy 29 | *.ts 30 | model_ts*.txt 31 | 32 | # ipython/jupyter notebooks 33 | *.ipynb 34 | **/.ipynb_checkpoints/ 35 | 36 | # Editor temporaries 37 | *.swn 38 | *.swo 39 | *.swp 40 | *~ 41 | 42 | # editor settings 43 | .idea 44 | .vscode 45 | _darcs 46 | 47 | # project dirs 48 | /detectron2/model_zoo/configs 49 | # /datasets/* 50 | !/datasets/*.* 51 | /projects/*/datasets 52 | /models 53 | /snippet 54 | /GroupViT 55 | /work_dirs 56 | /work_dirs_1 57 | test*.sh 58 | start*.sh 59 | slurm* 60 | /detectron2 61 | -------------------------------------------------------------------------------- /tools/convert-pretrained-swin-model-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download pretrained swin model: 12 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 13 | # run the conversion 14 | ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 15 | # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl" 18 | INPUT: 19 | FORMAT: "RGB" 20 | """ 21 | 22 | if __name__ == "__main__": 23 | input = sys.argv[1] 24 | 25 | obj = torch.load(input, map_location="cpu")["model"] 26 | 27 | res = {"model": obj, "__author__": "third_party", "matching_heuristics": True} 28 | 29 | with open(sys.argv[2], "wb") as f: 30 | pkl.dump(res, f) 31 | -------------------------------------------------------------------------------- /hgformer/modeling/pixel_decoder/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include "ms_deform_attn.h" 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /hgformer/data/datasets/register_gta.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import os 3 | 4 | from detectron2.data import DatasetCatalog, MetadataCatalog 5 | from detectron2.data.datasets import load_sem_seg 6 | from detectron2.data.datasets.builtin_meta import _get_builtin_metadata 7 | 8 | # ==== Predefined splits for raw gta images =========== 9 | 10 | GTA_Trainid = { 11 | "gta_trainid_val": ("gta/images/valid/", "gta/labels_detectron2/valid/"), 12 | } 13 | 14 | def register_all_gta_sem_seg(root): 15 | for key, (image_dir, gt_dir) in GTA_Trainid.items(): 16 | meta = _get_builtin_metadata("cityscapes") 17 | image_dir = os.path.join(root, image_dir) 18 | gt_dir = os.path.join(root, gt_dir) 19 | 20 | DatasetCatalog.register( 21 | key, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="png") 22 | ) 23 | MetadataCatalog.get(key).set( 24 | image_dir=image_dir, 25 | gt_dir=gt_dir, 26 | evaluator_type="sem_seg", 27 | ignore_label=255, 28 | **meta, 29 | ) 30 | 31 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 32 | 33 | register_all_gta_sem_seg(_root) -------------------------------------------------------------------------------- /hgformer/data/datasets/register_synthia.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
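# Registers the SYNTHIA train/val splits with detectron2's DatasetCatalog; labels are expected as
# Cityscapes train-ID PNGs under synthia/labels_detectron2/ (see datasets/prepare_synthia_sem_seg.py).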
2 | import os 3 | 4 | from detectron2.data import DatasetCatalog, MetadataCatalog 5 | from detectron2.data.datasets import load_sem_seg 6 | from detectron2.data.datasets.builtin_meta import _get_builtin_metadata 7 | # from .acdc import load_acdc_semantic 8 | # from detectron2.data.datasets import load_sem_seg 9 | 10 | 11 | _RAW_BDD_SPLITS = { 12 | "synthia_train": ("synthia/RGB/train", "synthia/labels_detectron2/train"), 13 | "synthia_val": ("synthia/RGB/val", "synthia/labels_detectron2/val") 14 | } 15 | 16 | def register_all_synthia(root): 17 | for key, (image_dir, gt_dir) in _RAW_BDD_SPLITS.items(): 18 | meta = _get_builtin_metadata("cityscapes") 19 | image_dir = os.path.join(root, image_dir) 20 | gt_dir = os.path.join(root, gt_dir) 21 | 22 | DatasetCatalog.register( 23 | key, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="png") 24 | ) 25 | MetadataCatalog.get(key).set( 26 | image_dir=image_dir, 27 | gt_dir=gt_dir, 28 | evaluator_type="sem_seg", 29 | ignore_label=255, 30 | **meta, 31 | ) 32 | 33 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 34 | register_all_synthia(_root) -------------------------------------------------------------------------------- /hgformer/data/datasets/register_bdd.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import os 3 | 4 | from detectron2.data import DatasetCatalog, MetadataCatalog 5 | from detectron2.data.datasets import load_sem_seg 6 | from detectron2.data.datasets.builtin_meta import _get_builtin_metadata 7 | 8 | 9 | _RAW_BDD_SPLITS = { 10 | "bdd_train": ("bdd/images/10k/train", "bdd/labels/sem_seg/masks/train"), 11 | "bdd_val": ("bdd/images/10k/val", "bdd/labels/sem_seg/masks/val") 12 | } 13 | 14 | def register_all_bdd(root): 15 | for key, (image_dir, gt_dir) in _RAW_BDD_SPLITS.items(): 16 | meta = _get_builtin_metadata("cityscapes") 17 | image_dir = os.path.join(root, image_dir) 18 | gt_dir = os.path.join(root, gt_dir) 19 | 20 | # DatasetCatalog.register( 21 | # key, lambda x=image_dir, y=gt_dir: load_sem_seg(x, y) 22 | # ) 23 | DatasetCatalog.register( 24 | key, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="jpg") 25 | ) 26 | MetadataCatalog.get(key).set( 27 | image_dir=image_dir, 28 | gt_dir=gt_dir, 29 | evaluator_type="sem_seg", 30 | ignore_label=255, 31 | **meta, 32 | ) 33 | 34 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 35 | register_all_bdd(_root) -------------------------------------------------------------------------------- /hgformer/data/datasets/register_mapillary_19.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
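# Registers the Mapillary Vistas train/val splits, remapped to the 19 Cityscapes classes
# (see datasets/prepare_mapillary_sem_seg.py), with detectron2's DatasetCatalog.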
2 | import os 3 | 4 | from detectron2.data import DatasetCatalog, MetadataCatalog 5 | from detectron2.data.datasets import load_sem_seg 6 | from detectron2.data.datasets.builtin_meta import _get_builtin_metadata 7 | # from detectron2.data.datasets import load_sem_seg 8 | 9 | 10 | _RAW_BDD_SPLITS = { 11 | "mapillary_train": ("mapillary/training/images", "mapillary/labels_detectron2/training"), 12 | "mapillary_val": ("mapillary/validation/images", "mapillary/labels_detectron2/validation") 13 | } 14 | 15 | def register_all_mapillary_19(root): 16 | for key, (image_dir, gt_dir) in _RAW_BDD_SPLITS.items(): 17 | meta = _get_builtin_metadata("cityscapes") 18 | image_dir = os.path.join(root, image_dir) 19 | gt_dir = os.path.join(root, gt_dir) 20 | 21 | DatasetCatalog.register( 22 | key, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="jpg") 23 | ) 24 | MetadataCatalog.get(key).set( 25 | image_dir=image_dir, 26 | gt_dir=gt_dir, 27 | evaluator_type="sem_seg", 28 | ignore_label=255, 29 | **meta, 30 | ) 31 | 32 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 33 | register_all_mapillary_19(_root) -------------------------------------------------------------------------------- /configs/city_c/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../cityscapes/maskformer2_R50_bs16_20k_gn.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | DATASETS: 20 | TRAIN: ("cityscapes_fine_sem_seg_train", ) 21 | TEST: ("cityscapes_fine_sem_seg_val", "cityscapes_fine_gaussian_noise_5_val", "cityscapes_fine_shot_noise_5_val", "cityscapes_fine_impulse_noise_5_val", "cityscapes_fine_defocus_blur_5_val", "cityscapes_fine_glass_blur_5_val", "cityscapes_fine_motion_blur_5_val", "cityscapes_fine_zoom_blur_5_val", "cityscapes_fine_snow_5_val", "cityscapes_fine_frost_5_val", "cityscapes_fine_fog_5_val", "cityscapes_fine_brightness_5_val", "cityscapes_fine_contrast_5_val", "cityscapes_fine_elastic_transform_5_val", "cityscapes_fine_pixelate_5_val", "cityscapes_fine_jpeg_compression_5_val", "cityscapes_fine_speckle_noise_5_val", "cityscapes_fine_gaussian_blur_5_val", "cityscapes_fine_spatter_5_val", "cityscapes_fine_saturate_5_val") 22 | -------------------------------------------------------------------------------- /configs/city_c/hgformer_swin_tiny_bs16_20k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../cityscapes/hgformer_R50_bs16_20k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | 17 | SOLVER: 18 | MAX_ITER: 20000 19 | # IMS_PER_BATCH: 2 20 | DATASETS: 21 | TRAIN: ("cityscapes_fine_sem_seg_train", ) 22 | TEST: ("cityscapes_fine_sem_seg_val", "cityscapes_fine_gaussian_noise_5_val", "cityscapes_fine_shot_noise_5_val", 
"cityscapes_fine_impulse_noise_5_val", "cityscapes_fine_defocus_blur_5_val", "cityscapes_fine_glass_blur_5_val", "cityscapes_fine_motion_blur_5_val", "cityscapes_fine_zoom_blur_5_val", "cityscapes_fine_snow_5_val", "cityscapes_fine_frost_5_val", "cityscapes_fine_fog_5_val", "cityscapes_fine_brightness_5_val", "cityscapes_fine_contrast_5_val", "cityscapes_fine_elastic_transform_5_val", "cityscapes_fine_pixelate_5_val", "cityscapes_fine_jpeg_compression_5_val", "cityscapes_fine_speckle_noise_5_val", "cityscapes_fine_gaussian_blur_5_val", "cityscapes_fine_spatter_5_val", "cityscapes_fine_saturate_5_val") 23 | TEST: 24 | CLUSTER_SOFTMAX: True 25 | PRED_STAGE: "spix_pixelexclude0125+stage3" -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HGFormer: Hierarchical Grouping Transformer for Domain Generalized Semantic Segmentation 2 | 3 | This is the official code for the [HGFormer](https://openaccess.thecvf.com/content/CVPR2023/papers/Ding_HGFormer_Hierarchical_Grouping_Transformer_for_Domain_Generalized_Semantic_Segmentation_CVPR_2023_paper.pdf) (CVPR 2023) 4 | 5 | ## Installation 6 | 7 | See [installation instructions](INSTALL.md). 8 | 9 | ## Getting Started 10 | 11 | See [Preparing Datasets for HGFormer](datasets/README.md). 12 | 13 | See [Getting Started with HGFormer](GETTING_STARTED.md). 14 | 15 | ## Pre-trained Models and Baselines 16 | 17 | We provide a large set of baseline results and trained models available for download in the [HGFormer Model Zoo](MODEL_ZOO.md). 18 | 19 | ## Citing HGFormer 20 | 21 | If you use HGFormer in your research, please use the following BibTeX entry. 22 | 23 | ```BibTeX 24 | @inproceedings{ding2023hgformer, 25 | title={HGFormer: Hierarchical Grouping Transformer for Domain Generalized Semantic Segmentation}, 26 | author={Ding, Jian and Xue, Nan and Xia, Gui-Song and Schiele, Bernt and Dai, Dengxin}, 27 | booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, 28 | pages={15413--15423}, 29 | year={2023} 30 | } 31 | ``` 32 | 33 | ## Acknowledgement 34 | 35 | Code is largely based on Mask2Former (https://github.com/facebookresearch/Mask2Former). 
36 | -------------------------------------------------------------------------------- /configs/city_c/maskformer2_swin_tiny_bs16_20k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../cityscapes/maskformer2_R50_bs16_20k_gn.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | 17 | DATALOADER: 18 | FILTER_EMPTY_ANNOTATIONS: True 19 | NUM_WORKERS: 4 20 | VERSION: 2 21 | SOLVER: 22 | MAX_ITER: 20000 23 | 24 | CUDNN_BENCHMARK: True 25 | DATASETS: 26 | TRAIN: ("cityscapes_fine_sem_seg_train", ) 27 | TEST: ("cityscapes_fine_sem_seg_val", "cityscapes_fine_gaussian_noise_5_val", "cityscapes_fine_shot_noise_5_val", "cityscapes_fine_impulse_noise_5_val", "cityscapes_fine_defocus_blur_5_val", "cityscapes_fine_glass_blur_5_val", "cityscapes_fine_motion_blur_5_val", "cityscapes_fine_zoom_blur_5_val", "cityscapes_fine_snow_5_val", "cityscapes_fine_frost_5_val", "cityscapes_fine_fog_5_val", "cityscapes_fine_brightness_5_val", "cityscapes_fine_contrast_5_val", "cityscapes_fine_elastic_transform_5_val", "cityscapes_fine_pixelate_5_val", "cityscapes_fine_jpeg_compression_5_val", "cityscapes_fine_speckle_noise_5_val", "cityscapes_fine_gaussian_blur_5_val", "cityscapes_fine_spatter_5_val", "cityscapes_fine_saturate_5_val") 28 | -------------------------------------------------------------------------------- /hgformer/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor ms_deform_attn_cuda_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step); 26 | 27 | std::vector ms_deform_attn_cuda_backward( 28 | const at::Tensor &value, 29 | const at::Tensor &spatial_shapes, 30 | const at::Tensor &level_start_index, 31 | const at::Tensor &sampling_loc, 32 | const at::Tensor &attn_weight, 33 | const at::Tensor &grad_output, 34 | const int im2col_step); 35 | 36 | -------------------------------------------------------------------------------- /hgformer/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 
2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor 20 | ms_deform_attn_cpu_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step); 27 | 28 | std::vector 29 | ms_deform_attn_cpu_backward( 30 | const at::Tensor &value, 31 | const at::Tensor &spatial_shapes, 32 | const at::Tensor &level_start_index, 33 | const at::Tensor &sampling_loc, 34 | const at::Tensor &attn_weight, 35 | const at::Tensor &grad_output, 36 | const int im2col_step); 37 | 38 | 39 | -------------------------------------------------------------------------------- /configs/cityscapes/maskformer2_R50_bs16_20k_gn.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | RESNETS: 5 | NORM: "GN" 6 | SEM_SEG_HEAD: 7 | NAME: "MaskFormerHead" 8 | IGNORE_VALUE: 255 9 | NUM_CLASSES: 19 10 | LOSS_WEIGHT: 1.0 11 | CONVS_DIM: 256 12 | MASK_DIM: 256 13 | NORM: "GN" 14 | # pixel decoder 15 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 16 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 17 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 18 | COMMON_STRIDE: 4 19 | TRANSFORMER_ENC_LAYERS: 6 20 | MASK_FORMER: 21 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 22 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 23 | DEEP_SUPERVISION: True 24 | NO_OBJECT_WEIGHT: 0.1 25 | CLASS_WEIGHT: 2.0 26 | MASK_WEIGHT: 5.0 27 | DICE_WEIGHT: 5.0 28 | HIDDEN_DIM: 256 29 | NUM_OBJECT_QUERIES: 100 30 | NHEADS: 8 31 | DROPOUT: 0.0 32 | DIM_FEEDFORWARD: 2048 33 | ENC_LAYERS: 0 34 | PRE_NORM: False 35 | ENFORCE_INPUT_PROJ: False 36 | SIZE_DIVISIBILITY: 32 37 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 38 | TRAIN_NUM_POINTS: 12544 39 | OVERSAMPLE_RATIO: 3.0 40 | IMPORTANCE_SAMPLE_RATIO: 0.75 41 | TEST: 42 | SEMANTIC_ON: True 43 | INSTANCE_ON: False 44 | PANOPTIC_ON: False 45 | OVERLAP_THRESHOLD: 0.8 46 | OBJECT_MASK_THRESHOLD: 0.8 47 | SOLVER: 48 | IMS_PER_BATCH: 16 49 | BASE_LR: 0.0001 50 | MAX_ITER: 20000 -------------------------------------------------------------------------------- /configs/mapillary/maskformer2_R50_bs16_90k_gn_mapillary_20k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-mapillary19-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | RESNETS: 5 | NORM: "GN" 6 | SEM_SEG_HEAD: 7 | NAME: "MaskFormerHead" 8 | IGNORE_VALUE: 255 9 | NUM_CLASSES: 19 10 | LOSS_WEIGHT: 1.0 11 | CONVS_DIM: 256 12 | MASK_DIM: 256 13 | NORM: "GN" 14 | # pixel decoder 15 | 
PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 16 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 17 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 18 | COMMON_STRIDE: 4 19 | TRANSFORMER_ENC_LAYERS: 6 20 | MASK_FORMER: 21 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 22 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 23 | DEEP_SUPERVISION: True 24 | NO_OBJECT_WEIGHT: 0.1 25 | CLASS_WEIGHT: 2.0 26 | MASK_WEIGHT: 5.0 27 | DICE_WEIGHT: 5.0 28 | HIDDEN_DIM: 256 29 | NUM_OBJECT_QUERIES: 100 30 | NHEADS: 8 31 | DROPOUT: 0.0 32 | DIM_FEEDFORWARD: 2048 33 | ENC_LAYERS: 0 34 | PRE_NORM: False 35 | ENFORCE_INPUT_PROJ: False 36 | SIZE_DIVISIBILITY: 32 37 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 38 | TRAIN_NUM_POINTS: 12544 39 | OVERSAMPLE_RATIO: 3.0 40 | IMPORTANCE_SAMPLE_RATIO: 0.75 41 | TEST: 42 | SEMANTIC_ON: True 43 | INSTANCE_ON: False 44 | PANOPTIC_ON: False 45 | OVERLAP_THRESHOLD: 0.8 46 | OBJECT_MASK_THRESHOLD: 0.8 47 | SOLVER: 48 | IMS_PER_BATCH: 16 49 | BASE_LR: 0.0001 50 | MAX_ITER: 20000 -------------------------------------------------------------------------------- /hgformer/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | 22 | at::Tensor 23 | ms_deform_attn_cpu_forward( 24 | const at::Tensor &value, 25 | const at::Tensor &spatial_shapes, 26 | const at::Tensor &level_start_index, 27 | const at::Tensor &sampling_loc, 28 | const at::Tensor &attn_weight, 29 | const int im2col_step) 30 | { 31 | AT_ERROR("Not implement on cpu"); 32 | } 33 | 34 | std::vector 35 | ms_deform_attn_cpu_backward( 36 | const at::Tensor &value, 37 | const at::Tensor &spatial_shapes, 38 | const at::Tensor &level_start_index, 39 | const at::Tensor &sampling_loc, 40 | const at::Tensor &attn_weight, 41 | const at::Tensor &grad_output, 42 | const int im2col_step) 43 | { 44 | AT_ERROR("Not implement on cpu"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /tools/convert-torchvision-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download one of the ResNet{18,34,50,101,152} models from torchvision: 12 | wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth 13 | # run the conversion 14 | ./convert-torchvision-to-d2.py r50.pth r50.pkl 15 | # Then, use r50.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/r50.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | RESNETS: 21 | DEPTH: 50 22 | STRIDE_IN_1X1: False 23 | INPUT: 24 | FORMAT: "RGB" 25 | """ 26 | 27 | if __name__ == "__main__": 28 | input = sys.argv[1] 29 | 30 | obj = torch.load(input, map_location="cpu") 31 | 32 | newmodel = {} 33 | for k in list(obj.keys()): 34 | old_k = k 35 | if "layer" not in k: 36 | k = "stem." + k 37 | for t in [1, 2, 3, 4]: 38 | k = k.replace("layer{}".format(t), "res{}".format(t + 1)) 39 | for t in [1, 2, 3]: 40 | k = k.replace("bn{}".format(t), "conv{}.norm".format(t)) 41 | k = k.replace("downsample.0", "shortcut") 42 | k = k.replace("downsample.1", "shortcut.norm") 43 | print(old_k, "->", k) 44 | newmodel[k] = obj.pop(old_k).detach().numpy() 45 | 46 | res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True} 47 | 48 | with open(sys.argv[2], "wb") as f: 49 | pkl.dump(res, f) 50 | if obj: 51 | print("Unconverted keys:", obj.keys()) 52 | -------------------------------------------------------------------------------- /datasets/split_data/gta/resize_img.py: -------------------------------------------------------------------------------- 1 | import os 2 | from PIL import Image 3 | import numpy as np 4 | import cv2 5 | 6 | def GetFileFromThisRootDir(dir,ext = None): 7 | allfiles = [] 8 | needExtFilter = (ext != None) 9 | for root,dirs,files in os.walk(dir): 10 | for filespath in files: 11 | filepath = os.path.join(root, filespath) 12 | extension = os.path.splitext(filepath)[1][1:] 13 | if needExtFilter and extension in ext: 14 | allfiles.append(filepath) 15 | elif not needExtFilter: 16 | allfiles.append(filepath) 17 | return allfiles 18 | 19 | def resize_split(split): 20 | filenames = GetFileFromThisRootDir(f'datasets/GTA/images/{split}') 21 | for filename in filenames: 22 | basename = os.path.basename(filename) 23 | img = Image.open(filename) 24 | gtname = os.path.join(f'datasets/GTA/labels/{split}', basename) 25 | gt = Image.open(gtname) 26 | print(f'filename: {filename}') 27 | if not os.path.exists(f'datasets/GTA/labels/{split}_resize'): 28 | os.makedirs(f'datasets/GTA/labels/{split}_resize') 29 | if (img.width != gt.width) or (img.height != gt.height): 30 | # read img 31 | gt_np = np.asarray(gt) 32 | # resize img 33 | width, height = img.width, img.height 34 | resized_gt_np = cv2.resize(gt_np, (width, height), interpolation=cv2.INTER_NEAREST) 35 | # import ipdb; 36 | # ipdb.set_trace() 37 | # save img 38 | outname = os.path.join(f'datasets/GTA/labels/{split}_resize', basename) 39 | cv2.imwrite(outname, resized_gt_np) 40 | 41 | if __name__ == '__main__': 42 | resize_split('valid') 43 | # resize_split('train') 44 | # resize_split('test') -------------------------------------------------------------------------------- /tools/evaluate_coco_boundary_ap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 3 | # Modified by Bowen Cheng from: https://github.com/bowenc0221/boundary-iou-api/blob/master/tools/coco_instance_evaluation.py 4 | 5 | """ 6 | Evaluation for COCO val2017: 7 | python ./tools/coco_instance_evaluation.py \ 8 | --gt-json-file COCO_GT_JSON \ 9 | --dt-json-file COCO_DT_JSON 10 | """ 11 | import argparse 12 | import json 13 | 14 | from boundary_iou.coco_instance_api.coco import COCO 15 | from boundary_iou.coco_instance_api.cocoeval import COCOeval 16 | 17 | 18 | def main(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--gt-json-file", default="") 21 | parser.add_argument("--dt-json-file", default="") 22 | parser.add_argument("--iou-type", default="boundary") 23 | parser.add_argument("--dilation-ratio", default="0.020", type=float) 24 | args = parser.parse_args() 25 | print(args) 26 | 27 | annFile = args.gt_json_file 28 | resFile = args.dt_json_file 29 | dilation_ratio = args.dilation_ratio 30 | if args.iou_type == "boundary": 31 | get_boundary = True 32 | else: 33 | get_boundary = False 34 | cocoGt = COCO(annFile, get_boundary=get_boundary, dilation_ratio=dilation_ratio) 35 | 36 | # remove box predictions 37 | resFile = json.load(open(resFile)) 38 | for c in resFile: 39 | c.pop("bbox", None) 40 | 41 | cocoDt = cocoGt.loadRes(resFile) 42 | cocoEval = COCOeval(cocoGt, cocoDt, iouType=args.iou_type, dilation_ratio=dilation_ratio) 43 | cocoEval.evaluate() 44 | cocoEval.accumulate() 45 | cocoEval.summarize() 46 | 47 | 48 | if __name__ == '__main__': 49 | main() 50 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | ### Requirements 4 | - Linux or macOS with Python ≥ 3.6 5 | - PyTorch ≥ 1.9 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation. 6 | Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note, please check 7 | PyTorch version matches that is required by Detectron2. 8 | - Detectron2: follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html). 9 | - OpenCV is optional but needed by demo and visualization 10 | - `pip install -r requirements.txt` 11 | 12 | ### CUDA kernel for MSDeformAttn 13 | After preparing the required environment, run the following command to compile CUDA kernel for MSDeformAttn: 14 | 15 | `CUDA_HOME` must be defined and points to the directory of the installed CUDA toolkit. 16 | 17 | ```bash 18 | cd hgformer/modeling/pixel_decoder/ops 19 | python setup.py build install 20 | ``` 21 | 22 | #### Building on another system 23 | To build on a system that does not have a GPU device but provide the drivers: 24 | ```bash 25 | TORCH_CUDA_ARCH_LIST='8.0' FORCE_CUDA=1 python setup.py build install 26 | ``` 27 | 28 | ### Example conda environment setup 29 | ```bash 30 | conda create --name hgformer python=3.8 -y 31 | conda activate hgformer 32 | conda install pytorch==1.9.0 torchvision==0.10.0 cudatoolkit=11.1 -c pytorch -c nvidia 33 | pip install -U opencv-python 34 | 35 | # under your working directory 36 | python -m pip install detectron2 -f \ 37 | https://dl.fbaipublicfiles.com/detectron2/wheels/cu111/torch1.9/index.html 38 | 39 | pip install git+https://github.com/mcordts/cityscapesScripts.git 40 | 41 | cd .. 
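# clone HGFormer, install its Python dependencies, and build the MSDeformAttn CUDA op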
42 | git clone https://github.com/dingjiansw101/HGFormer.git 43 | cd HGFormer 44 | pip install -r requirements.txt 45 | cd hgformer/modeling/pixel_decoder/ops 46 | sh make.sh 47 | ``` 48 | -------------------------------------------------------------------------------- /configs/mapillary/Base-mapillary19-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_train",) 18 | TEST: ("mapillary_val", "gta_trainid_val", "synthia_val", "cityscapes_fine_sem_seg_val", "bdd_val") 19 | 20 | SOLVER: 21 | IMS_PER_BATCH: 16 22 | BASE_LR: 0.0001 23 | MAX_ITER: 90000 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 0 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | LR_SCHEDULER_NAME: "WarmupPolyLR" 29 | BACKBONE_MULTIPLIER: 0.1 30 | CLIP_GRADIENTS: 31 | ENABLED: True 32 | CLIP_TYPE: "full_model" 33 | CLIP_VALUE: 0.01 34 | NORM_TYPE: 2.0 35 | AMP: 36 | ENABLED: True 37 | INPUT: 38 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 39 | MIN_SIZE_TRAIN_SAMPLING: "choice" 40 | MIN_SIZE_TEST: 1024 41 | MAX_SIZE_TRAIN: 4096 42 | MAX_SIZE_TEST: 2048 43 | CROP: 44 | ENABLED: True 45 | TYPE: "absolute" 46 | SIZE: (512, 1024) 47 | SINGLE_CATEGORY_MAX_AREA: 1.0 48 | COLOR_AUG_SSD: True 49 | SIZE_DIVISIBILITY: -1 50 | FORMAT: "RGB" 51 | DATASET_MAPPER_NAME: "mask_former_semantic" 52 | TEST: 53 | EVAL_PERIOD: 90000 54 | AUG: 55 | ENABLED: False 56 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 57 | MAX_SIZE: 4096 58 | FLIP: True 59 | DATALOADER: 60 | FILTER_EMPTY_ANNOTATIONS: True 61 | NUM_WORKERS: 4 62 | VERSION: 2 63 | -------------------------------------------------------------------------------- /datasets/prepare_gta_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import os 4 | from pathlib import Path 5 | 6 | import numpy as np 7 | import tqdm 8 | from PIL import Image 9 | from multiprocessing import Pool 10 | 11 | id_to_trainid = {7: 0, 8: 1, 11: 2, 12: 3, 13: 4, 17: 5, 12 | 19: 6, 20: 7, 21: 8, 22: 9, 23: 10, 24: 11, 25: 12, 13 | 26: 13, 27: 14, 28: 15, 31: 16, 32: 17, 33: 18} 14 | 15 | 16 | def convert(input, outputpath): 17 | lab = np.asarray(Image.open(input)) 18 | assert lab.dtype == np.uint8 19 | output = np.zeros_like(lab, dtype=np.uint8) + 255 20 | for obj_id in np.unique(lab): 21 | if obj_id in id_to_trainid: 22 | output[lab == obj_id] = id_to_trainid[obj_id] 23 | 24 | Image.fromarray(output).save(outputpath) 25 | 26 | def worker(file_tuple): 27 | file, output_file = file_tuple 28 | lab = np.asarray(Image.open(file)) 29 | assert lab.dtype == np.uint8 30 | output = np.zeros_like(lab, dtype=np.uint8) + 255 31 | for obj_id in np.unique(lab): 32 | if obj_id in id_to_trainid: 33 | output[lab == obj_id] = id_to_trainid[obj_id] 34 | 35 | Image.fromarray(output).save(output_file) 36 | 37 | if __name__ == "__main__": 38 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "GTA" 
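    # remap each split's raw GTA label IDs to Cityscapes train IDs (unmapped IDs become 255)
    # and write the results to labels_detectron2/<split> using a 32-worker pool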
39 | for name in ["train", "valid", "test"]: 40 | annotation_dir = dataset_dir / "labels" / name 41 | output_dir = dataset_dir / "labels_detectron2" / name 42 | output_dir.mkdir(parents=True, exist_ok=True) 43 | 44 | file_list = [] 45 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 46 | output_file = output_dir / file.name 47 | file_list.append((file, output_file)) 48 | # convert(file, output_file) 49 | 50 | pool = Pool(32) 51 | pool.map(worker, file_list) 52 | print(f'done {name}') 53 | -------------------------------------------------------------------------------- /configs/cityscapes/Base-Cityscapes-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | 17 | DATASETS: 18 | TRAIN: ("cityscapes_fine_sem_seg_train",) 19 | TEST: ("cityscapes_fine_sem_seg_val", "mapillary_val", "bdd_val", "gta_trainid_val", "synthia_val") 20 | SOLVER: 21 | IMS_PER_BATCH: 16 22 | BASE_LR: 0.0001 23 | MAX_ITER: 90000 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 0 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | LR_SCHEDULER_NAME: "WarmupPolyLR" 29 | BACKBONE_MULTIPLIER: 0.1 30 | # dense period for job array on gpu22 31 | CHECKPOINT_PERIOD: 1000 32 | CLIP_GRADIENTS: 33 | ENABLED: True 34 | CLIP_TYPE: "full_model" 35 | CLIP_VALUE: 0.01 36 | NORM_TYPE: 2.0 37 | AMP: 38 | ENABLED: True 39 | INPUT: 40 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 41 | MIN_SIZE_TRAIN_SAMPLING: "choice" 42 | MIN_SIZE_TEST: 1024 43 | MAX_SIZE_TRAIN: 4096 44 | MAX_SIZE_TEST: 2048 45 | CROP: 46 | ENABLED: True 47 | TYPE: "absolute" 48 | SIZE: (512, 1024) 49 | SINGLE_CATEGORY_MAX_AREA: 1.0 50 | COLOR_AUG_SSD: True 51 | SIZE_DIVISIBILITY: -1 52 | FORMAT: "RGB" 53 | DATASET_MAPPER_NAME: "mask_former_semantic" 54 | TEST: 55 | EVAL_PERIOD: 5000 56 | AUG: 57 | ENABLED: False 58 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 59 | MAX_SIZE: 4096 60 | FLIP: True 61 | DATALOADER: 62 | FILTER_EMPTY_ANNOTATIONS: True 63 | NUM_WORKERS: 4 64 | VERSION: 2 65 | 66 | #CUDNN_BENCHMARK: True -------------------------------------------------------------------------------- /hgformer/data/datasets/register_city_c.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
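# Registers one Cityscapes-C validation split per (corruption, severity) pair, e.g. "cityscapes_fine_fog_5_val",
# reusing the clean Cityscapes ground truth and evaluator; the corrupted images themselves are produced by
# datasets/generate_cityscapes_c.py.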
2 | import os 3 | 4 | from detectron2.data import DatasetCatalog, MetadataCatalog 5 | from detectron2.data.datasets import load_sem_seg 6 | from detectron2.data.datasets.builtin_meta import _get_builtin_metadata 7 | # from .acdc import load_acdc_semantic 8 | from detectron2.data.datasets.cityscapes import load_cityscapes_semantic 9 | 10 | corruptions = ['clean', 'gaussian_noise', 'shot_noise', 'impulse_noise', 'defocus_blur', 11 | 'glass_blur', 'motion_blur', 'zoom_blur', 'snow', 'frost', 'fog', 12 | 'brightness', 'contrast', 'elastic_transform', 'pixelate', 'jpeg_compression', 13 | 'speckle_noise', 'gaussian_blur', 'spatter', 'saturate'] 14 | # ==== Predefined splits for raw cityscapes c images =========== 15 | 16 | _RAW_ACDC_SPLITS = {} 17 | for noise in corruptions: 18 | if noise == 'clean': 19 | cur_data = {f"cityscapes_fine_{noise}_val": (f"cityscapes-c/{noise}/", "cityscapes/gtFine/val/")} 20 | else: 21 | for severity in range(5): 22 | severity_str = str(severity+1) 23 | cur_data = {f"cityscapes_fine_{noise}_{severity_str}_val": (f"cityscapes-c/{noise}/{severity_str}", "cityscapes/gtFine/val/")} 24 | _RAW_ACDC_SPLITS.update(cur_data) 25 | def register_all_city_c(root): 26 | for key, (image_dir, gt_dir) in _RAW_ACDC_SPLITS.items(): 27 | meta = _get_builtin_metadata("cityscapes") 28 | image_dir = os.path.join(root, image_dir) 29 | gt_dir = os.path.join(root, gt_dir) 30 | # sem_key = key.format(task="sem_seg") 31 | DatasetCatalog.register( 32 | key, lambda x=image_dir, y=gt_dir: load_cityscapes_semantic(x, y) 33 | ) 34 | MetadataCatalog.get(key).set( 35 | image_dir=image_dir, 36 | gt_dir=gt_dir, 37 | evaluator_type="cityscapes_sem_seg", 38 | ignore_label=255, 39 | **meta, 40 | ) 41 | 42 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 43 | register_all_city_c(_root) 44 | -------------------------------------------------------------------------------- /hgformer/data/samplers/grouped_batch_sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import numpy as np 3 | from torch.utils.data.sampler import BatchSampler, Sampler 4 | 5 | 6 | class GroupedBatchSampler(BatchSampler): 7 | """ 8 | Wraps another sampler to yield a mini-batch of indices. 9 | It enforces that the batch only contain elements from the same group. 10 | It also tries to provide mini-batches which follows an ordering which is 11 | as close as possible to the ordering from the original sampler. 12 | """ 13 | 14 | def __init__(self, sampler, group_ids, batch_size): 15 | """ 16 | Args: 17 | sampler (Sampler): Base sampler. 18 | group_ids (list[int]): If the sampler produces indices in range [0, N), 19 | `group_ids` must be a list of `N` ints which contains the group id of each sample. 20 | The group ids must be a set of integers in the range [0, num_groups). 21 | batch_size (int): Size of mini-batch. 
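        Example (illustrative only; any torch Sampler yielding dataset indices works):

            >>> from torch.utils.data.sampler import SequentialSampler
            >>> group_ids = [0, 1, 0, 1]  # e.g. the aspect-ratio group of each image
            >>> for batch in GroupedBatchSampler(SequentialSampler(range(4)), group_ids, batch_size=2):
            ...     print(batch)
            [0, 2]
            [1, 3]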
22 | """ 23 | if not isinstance(sampler, Sampler): 24 | raise ValueError( 25 | "sampler should be an instance of " 26 | "torch.utils.data.Sampler, but got sampler={}".format(sampler) 27 | ) 28 | self.sampler = sampler 29 | self.group_ids = np.asarray(group_ids) 30 | assert self.group_ids.ndim == 1 31 | self.batch_size = batch_size 32 | groups = np.unique(self.group_ids).tolist() 33 | 34 | # buffer the indices of each group until batch size is reached 35 | self.buffer_per_group = {k: [] for k in groups} 36 | 37 | def __iter__(self): 38 | for idx in self.sampler: 39 | group_id = self.group_ids[idx] 40 | group_buffer = self.buffer_per_group[group_id] 41 | group_buffer.append(idx) 42 | if len(group_buffer) == self.batch_size: 43 | yield group_buffer[:] # yield a copy of the list 44 | del group_buffer[:] 45 | 46 | def __len__(self): 47 | raise NotImplementedError("len() of GroupedBatchSampler is not well-defined.") 48 | -------------------------------------------------------------------------------- /hgformer/modeling/pixel_decoder/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | 18 | #include "cpu/ms_deform_attn_cpu.h" 19 | 20 | #ifdef WITH_CUDA 21 | #include "cuda/ms_deform_attn_cuda.h" 22 | #endif 23 | 24 | 25 | at::Tensor 26 | ms_deform_attn_forward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const int im2col_step) 33 | { 34 | if (value.type().is_cuda()) 35 | { 36 | #ifdef WITH_CUDA 37 | return ms_deform_attn_cuda_forward( 38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector 47 | ms_deform_attn_backward( 48 | const at::Tensor &value, 49 | const at::Tensor &spatial_shapes, 50 | const at::Tensor &level_start_index, 51 | const at::Tensor &sampling_loc, 52 | const at::Tensor &attn_weight, 53 | const at::Tensor &grad_output, 54 | const int im2col_step) 55 | { 56 | if (value.type().is_cuda()) 57 | { 58 | #ifdef WITH_CUDA 59 | return ms_deform_attn_cuda_backward( 60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 61 | #else 62 | AT_ERROR("Not compiled with GPU support"); 63 | #endif 64 | } 65 | AT_ERROR("Not implemented on the CPU"); 66 | } 67 | 68 | -------------------------------------------------------------------------------- /hgformer/data/datasets/register_city_c_vis.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. 
and its affiliates. 2 | import os 3 | 4 | from detectron2.data import DatasetCatalog, MetadataCatalog 5 | from detectron2.data.datasets import load_sem_seg 6 | from detectron2.data.datasets.builtin_meta import _get_builtin_metadata 7 | # from .acdc import load_acdc_semantic 8 | from detectron2.data.datasets.cityscapes import load_cityscapes_semantic 9 | 10 | corruptions = ['clean', 'gaussian_noise', 'shot_noise', 'impulse_noise', 'defocus_blur', 11 | 'glass_blur', 'motion_blur', 'zoom_blur', 'snow', 'frost', 'fog', 12 | 'brightness', 'contrast', 'elastic_transform', 'pixelate', 'jpeg_compression', 13 | 'speckle_noise', 'gaussian_blur', 'spatter', 'saturate'] 14 | # ==== Predefined splits for raw cityscapes c images =========== 15 | _RAW_ACDC_SPLITS = { 16 | "city_c_gaussiannoise5_vis": ("gauss_noise/5/", "cityscapes/gtFine/val/"), 17 | "city_c_gaussiannoise4_vis": ("gauss_noise/4/", "cityscapes/gtFine/val/"), 18 | "city_c_gaussiannoise3_vis": ("gauss_noise/3/", "cityscapes/gtFine/val/"), 19 | "city_c_gaussiannoise2_vis": ("gauss_noise/2/", "cityscapes/gtFine/val/"), 20 | "city_c_gaussiannoise1_vis": ("gauss_noise/1/", "cityscapes/gtFine/val/"), 21 | "city_c_gaussiannoise0_vis": ("gauss_noise/0/", "cityscapes/gtFine/val/"), 22 | "city_c_tmp_gaussiannoise4_vis": ("city_c_tmp/gaussian_noise/4/", "cityscapes/gtFine/val/"), 23 | "city_c_tmp_clean_vis": ("city_c_tmp/clean/", "cityscapes/gtFine/val/"), 24 | 25 | } 26 | 27 | def register_all_city_c_vis(root): 28 | for key, (image_dir, gt_dir) in _RAW_ACDC_SPLITS.items(): 29 | meta = _get_builtin_metadata("cityscapes") 30 | image_dir = os.path.join(root, image_dir) 31 | gt_dir = os.path.join(root, gt_dir) 32 | # sem_key = key.format(task="sem_seg") 33 | DatasetCatalog.register( 34 | key, lambda x=image_dir, y=gt_dir: load_cityscapes_semantic(x, y) 35 | ) 36 | MetadataCatalog.get(key).set( 37 | image_dir=image_dir, 38 | gt_dir=gt_dir, 39 | evaluator_type="cityscapes_sem_seg", 40 | ignore_label=255, 41 | **meta, 42 | ) 43 | 44 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 45 | register_all_city_c_vis(_root) 46 | -------------------------------------------------------------------------------- /datasets/generate_cityscapes_c.py: -------------------------------------------------------------------------------- 1 | from imagecorruptions import corrupt 2 | from imagecorruptions import get_corruption_names 3 | import os 4 | import cv2 5 | from multiprocessing import Pool 6 | import numpy as np 7 | import random 8 | import mmcv 9 | 10 | random.seed(8) # for reproducibility 11 | np.random.seed(8) 12 | corruptions = ['gaussian_noise', 'shot_noise', 'impulse_noise', 'defocus_blur', 13 | 'glass_blur', 'motion_blur', 'zoom_blur', 'snow', 'frost', 'fog', 14 | 'brightness', 'contrast', 'elastic_transform', 'pixelate', 'jpeg_compression', 15 | 'speckle_noise', 'gaussian_blur', 'spatter', 'saturate'] 16 | 17 | img_dir = 'datasets/cityscapes-c/clean' 18 | num_imgs = 500 19 | img_names = [] 20 | prog_bar = mmcv.ProgressBar(num_imgs) 21 | img_dict = {} 22 | for img_path in mmcv.scandir(img_dir, suffix='png', recursive=True): 23 | img_name = os.path.join(img_dir, img_path) 24 | img = mmcv.imread(img_name) 25 | img_dict[img_name] = img 26 | prog_bar.update() 27 | 28 | def perturb(i, p, s): 29 | img = corrupt(i, corruption_name=p, severity=s) 30 | return img 31 | 32 | def worker(optuple): 33 | srcfile, p, s, perturbed_img_path = optuple 34 | img = img_dict[srcfile] 35 | perturbed_img = perturb(img, p, s) 36 | mmcv.imwrite(perturbed_img, perturbed_img_path, 
auto_mkdir=True) 37 | 38 | def convert_img_path(ori_path, suffix): 39 | new_path = ori_path.replace('clean', suffix) 40 | assert new_path != ori_path 41 | return new_path 42 | 43 | if __name__ == '__main__': 44 | 45 | pool = Pool(32) 46 | filelist = [] 47 | for p in corruptions: 48 | print("\n ### gen corruption:{} ###".format(p)) 49 | for img_path in mmcv.scandir(img_dir, suffix='png', recursive=True): 50 | srcfile = os.path.join(img_dir, img_path) 51 | for s in range(5): 52 | img_suffix = p + "/" + str(s+1) 53 | out_dir = img_dir.replace('clean', img_suffix) 54 | assert out_dir != img_dir 55 | if not os.path.exists(out_dir): 56 | os.makedirs(out_dir) 57 | perturbed_img_path = convert_img_path(srcfile, img_suffix) 58 | filelist.append((srcfile, p, s+1, perturbed_img_path)) 59 | # import ipdb; ipdb.set_trace() 60 | pool.map(worker, filelist) -------------------------------------------------------------------------------- /datasets/find_truncated_images.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import os 3 | import numpy as np 4 | 5 | # https://en.wikipedia.org/wiki/YUV#SDTV_with_BT.601 6 | _M_RGB2YUV = [[0.299, 0.587, 0.114], [-0.14713, -0.28886, 0.436], [0.615, -0.51499, -0.10001]] 7 | _M_YUV2RGB = [[1.0, 0.0, 1.13983], [1.0, -0.39465, -0.58060], [1.0, 2.03211, 0.0]] 8 | 9 | def GetFileFromThisRootDir(dir,ext = None): 10 | allfiles = [] 11 | needExtFilter = (ext != None) 12 | for root,dirs,files in os.walk(dir): 13 | for filespath in files: 14 | filepath = os.path.join(root, filespath) 15 | extension = os.path.splitext(filepath)[1][1:] 16 | if needExtFilter and extension in ext: 17 | allfiles.append(filepath) 18 | elif not needExtFilter: 19 | allfiles.append(filepath) 20 | return allfiles 21 | 22 | def convert_PIL_to_numpy(image, format): 23 | """ 24 | Convert PIL image to numpy array of target format. 
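    For example (illustrative), ``convert_PIL_to_numpy(Image.open(path), "BGR")``
    returns an HWC uint8 array with the channel order flipped from RGB to BGR.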
25 | 26 | Args: 27 | image (PIL.Image): a PIL image 28 | format (str): the format of output image 29 | 30 | Returns: 31 | (np.ndarray): also see `read_image` 32 | """ 33 | if format is not None: 34 | # PIL only supports RGB, so convert to RGB and flip channels over below 35 | conversion_format = format 36 | if format in ["BGR", "YUV-BT.601"]: 37 | conversion_format = "RGB" 38 | image = image.convert(conversion_format) 39 | image = np.asarray(image) 40 | # PIL squeezes out the channel dimension for "L", so make it HWC 41 | if format == "L": 42 | image = np.expand_dims(image, -1) 43 | 44 | # handle formats not supported by PIL 45 | elif format == "BGR": 46 | # flip channels if needed 47 | image = image[:, :, ::-1] 48 | elif format == "YUV-BT.601": 49 | image = image / 255.0 50 | image = np.dot(image, np.array(_M_RGB2YUV).T) 51 | 52 | return image 53 | 54 | filepath = "/BS/databases15/GTA/images/train" 55 | 56 | filenames = GetFileFromThisRootDir(filepath) 57 | count = 0 58 | for file in filenames: 59 | img = Image.open(file) 60 | print(f'filename: {file}') 61 | try: 62 | img_np = convert_PIL_to_numpy(img, format="RGB") 63 | except: 64 | # import ipdb; ipdb.set_trace() 65 | count = count + 1 66 | print(f"count: {count}") 67 | print(f"count: {count}") -------------------------------------------------------------------------------- /datasets/split_data/synthia/split_synthia.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool 2 | import shutil 3 | import os 4 | shutil._USE_CP_SENDFILE = False 5 | 6 | def worker(path_pair): 7 | srcpath, dstpath = path_pair 8 | # print(f'srcpath{srcpath}') 9 | # print(f'dstpath{dstpath}') 10 | # shutil.copyfile(srcpath, dstpath) 11 | shutil.move(srcpath, dstpath) 12 | 13 | if __name__ == '__main__': 14 | pool = Pool(32) 15 | image_path = r'datasets/synthia/RGB' 16 | label_path = r'datasets/synthia/GT/LABELS' 17 | 18 | # dst_image_path = r'datasets/synthia_split/RGB' 19 | # dst_label_path = r'datasets/synthia_split/GT' 20 | 21 | dst_image_path = image_path 22 | dst_label_path = label_path 23 | 24 | with open('datasets/split_data/synthia_split_train.txt', 'r') as f: 25 | train_list = f.readlines() 26 | train_list = [x.strip() for x in train_list] 27 | 28 | with open('datasets/split_data/synthia_split_val.txt', 'r') as f: 29 | val_list = f.readlines() 30 | val_list = [x.strip() for x in val_list] 31 | 32 | train_pairs = [] 33 | 34 | if not os.path.exists(os.path.join(dst_image_path, 'train')): 35 | os.makedirs(os.path.join(dst_image_path, 'train')) 36 | 37 | if not os.path.exists(os.path.join(dst_label_path, 'train')): 38 | os.makedirs(os.path.join(dst_label_path, 'train')) 39 | 40 | for file in train_list: 41 | srcfile = os.path.join(image_path, file) 42 | dstfile = os.path.join(dst_image_path, 'train', file) 43 | train_pairs.append((srcfile, dstfile)) 44 | 45 | srclabel = os.path.join(label_path, file) 46 | dstlabel = os.path.join(dst_label_path, 'train', file) 47 | train_pairs.append((srclabel, dstlabel)) 48 | pool.map(worker, train_pairs) 49 | 50 | val_pairs = [] 51 | 52 | if not os.path.exists(os.path.join(dst_image_path, 'val')): 53 | os.makedirs(os.path.join(dst_image_path, 'val')) 54 | 55 | if not os.path.exists(os.path.join(dst_label_path, 'val')): 56 | os.makedirs(os.path.join(dst_label_path, 'val')) 57 | 58 | for file in val_list: 59 | srcfile = os.path.join(image_path, file) 60 | dstfile = os.path.join(dst_image_path, 'val', file) 61 | val_pairs.append((srcfile, dstfile)) 62 | 63 | 
srclabel = os.path.join(label_path, file) 64 | dstlabel = os.path.join(dst_label_path, 'val', file) 65 | val_pairs.append((srclabel, dstlabel)) 66 | pool.map(worker, val_pairs) 67 | -------------------------------------------------------------------------------- /configs/cityscapes/hgformer_R50_bs16_20k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "GroupFormer" 4 | RESNETS: 5 | NORM: "GN" 6 | SEM_SEG_HEAD: 7 | NAME: "MaskFormerHead" 8 | IGNORE_VALUE: 255 9 | NUM_CLASSES: 19 10 | LOSS_WEIGHT: 1.0 11 | CONVS_DIM: 256 12 | MASK_DIM: 256 13 | NORM: "GN" 14 | # pixel decoder 15 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 16 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 17 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 18 | COMMON_STRIDE: 4 19 | TRANSFORMER_ENC_LAYERS: 6 20 | NUM_GROUP_TOKENS: [512, 32] 21 | NUM_OUTPUT_GROUPS: [512, 32] 22 | # DOWNSAMPLE_RATE: 16 # 0.31 23 | # DOWNSAMPLE_RATE: 8 # 24 | DOWNSAMPLE_RATE: 4 # 0.32s 25 | 26 | # SPIX_RES: [16, 16] 27 | MASK_FORMER: 28 | # TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 29 | TRANSFORMER_DECODER_NAME: "GroupFormerDecoder" 30 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 31 | DEEP_SUPERVISION: True 32 | # DEEP_MASK_SUPERVISION: False 33 | NO_OBJECT_WEIGHT: 0.1 34 | STAGE_WEIGHTS: [1.0] 35 | CLASS_WEIGHT: 2.0 36 | MASK_WEIGHT: 5.0 37 | DICE_WEIGHT: 5.0 38 | SPIX_MASK_WEIGHT: 0.0 39 | SPIX_CLASS_WEIGHT: 2.0 40 | CONTRASTIVE_LOSS: True 41 | CONTRASTIVE_WEIGH: 6.0 42 | CONTRASTIVE_TAU: 0.1 43 | HIDDEN_DIM: 256 44 | NUM_OBJECT_QUERIES: 100 45 | NHEADS: 8 46 | DROPOUT: 0.0 47 | DIM_FEEDFORWARD: 2048 48 | ENC_LAYERS: 0 49 | PRE_NORM: False 50 | ENFORCE_INPUT_PROJ: False 51 | SIZE_DIVISIBILITY: 32 52 | DEC_LAYERS: 6 # 9 decoder layers, add one for the loss on learnable query 53 | SPIX_SELF_ATTEN_LAYERS: 6 54 | TRAIN_NUM_POINTS: 12544 55 | OVERSAMPLE_RATIO: 3.0 56 | IMPORTANCE_SAMPLE_RATIO: 0.75 57 | TEST: 58 | SEMANTIC_ON: True 59 | INSTANCE_ON: False 60 | PANOPTIC_ON: False 61 | OVERLAP_THRESHOLD: 0.8 62 | OBJECT_MASK_THRESHOLD: 0.8 63 | SOLVER: 64 | IMS_PER_BATCH: 16 65 | BASE_LR: 0.0001 66 | MAX_ITER: 20000 67 | WARMUP_FACTOR: 1.0 68 | WARMUP_ITERS: 0 69 | WEIGHT_DECAY: 0.05 70 | OPTIMIZER: "ADAMW" 71 | LR_SCHEDULER_NAME: "WarmupPolyLR" 72 | BACKBONE_MULTIPLIER: 0.1 73 | CLIP_GRADIENTS: 74 | ENABLED: True 75 | CLIP_TYPE: "full_model" 76 | CLIP_VALUE: 0.01 77 | NORM_TYPE: 2.0 78 | AMP: 79 | ENABLED: False 80 | 81 | DATALOADER: 82 | FILTER_EMPTY_ANNOTATIONS: True 83 | NUM_WORKERS: 4 84 | VERSION: 2 85 | 86 | CUDNN_BENCHMARK: True 87 | TEST: 88 | CLUSTER_SOFTMAX: True 89 | PRED_STAGE: "spix_pixelexclude0125+stage3" -------------------------------------------------------------------------------- /GETTING_STARTED.md: -------------------------------------------------------------------------------- 1 | # Getting Started with HGFormer 2 | 3 | This document provides a brief intro of the usage of HGFormer. 4 | 5 | Please see [Getting Started with Detectron2](https://github.com/facebookresearch/detectron2/blob/master/GETTING_STARTED.md) for full usage. 6 | 7 | ## Evaluation with Pre-trained Models 8 | 9 | Download [models](https://drive.google.com/drive/folders/1fUWaIhXtSxHLdTFxnuOSldLUe_ferauh?usp=drive_link). 
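The evaluation datasets referenced by the configs (e.g. ```mapillary_val```, ```bdd_val```, ```gta_trainid_val```, ```synthia_val``` and the Cityscapes-C splits) are registered at import time by the modules in ```hgformer/data/datasets```, and their folders are resolved through the ```DETECTRON2_DATASETS``` environment variable (default: ```datasets```). The snippet below is only an illustrative sanity check; it assumes HGFormer is installed as described in [INSTALL.md](./INSTALL.md) so that importing ```hgformer.data.datasets``` succeeds in your environment:
```
import os
os.environ.setdefault("DETECTRON2_DATASETS", "/path/to/datasets")  # set before importing

from detectron2.data import DatasetCatalog
import hgformer.data.datasets  # noqa: F401  importing triggers the registration

print(sorted(n for n in DatasetCatalog.list() if "mapillary" in n or "bdd" in n))
```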
10 | 11 | ### Cityscapes -> ACDC 12 | 13 | ``` 14 | python demo/inference.py --config-file configs/cityscapes/hgformer_swin_tiny_bs16_20k.yaml \ 15 | --input datasets/acdc/rgb_anon/all/test --output path_to_output \ 16 | --opts MODEL.WEIGHTS path_to_checkpoint 17 | ``` 18 | After running the command, you will find the results in ```path_to_output```. Then you can follow the instructions on the [ACDC evaluation server](https://acdc.vision.ee.ethz.ch/login?target=%2Fsubmit) to get your scores. 19 | To evaluate on a specific condition only, replace ```all``` with one of ```fog```, ```snow```, ```night``` or ```rain```. 20 | 21 | ### Cityscapes -> Cityscapes-C 22 | 23 | ``` 24 | python test_city_c_level5.py --num-gpus 8 --config-file configs/city_c/hgformer_swin_tiny_bs16_20k.yaml \ 25 | --eval-only MODEL.WEIGHTS path_to_checkpoint OUTPUT_DIR path_to_output 26 | ``` 27 | 28 | ### Cityscapes -> Others 29 | 30 | ``` 31 | python plain_train_net.py --num-gpus 8 --config-file configs/cityscapes/hgformer_swin_tiny_bs16_20k.yaml \ 32 | --eval-only MODEL.WEIGHTS path_to_checkpoint OUTPUT_DIR path_to_output 33 | ``` 34 | 35 | ### Mapillary -> Others 36 | 37 | ``` 38 | python plain_train_net.py --num-gpus 8 --config-file configs/mapillary/hgformer_swin_tiny_bs16_20k_mapillary.yaml \ 39 | --eval-only MODEL.WEIGHTS path_to_checkpoint OUTPUT_DIR path_to_output 40 | ``` 41 | 42 | ## Training in Command Line 43 | 44 | 45 | To train a model, first 46 | set up the corresponding datasets following 47 | [datasets/README.md](./datasets/README.md), then prepare the models pre-trained on ImageNet classification following [tools/README.md](./tools/README.md). Finally, run: 48 | ``` 49 | python plain_train_net.py --num-gpus 8 \ 50 | --config-file configs/cityscapes/hgformer_swin_tiny_bs16_20k.yaml OUTPUT_DIR path_to_output 51 | ``` 52 | 53 | The configs are made for 8-GPU training. 54 | Since we use the AdamW optimizer, there is no established rule for scaling the learning rate with the batch size. 
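If you do change the batch size, a common (but not validated for HGFormer) starting point is the linear-scaling heuristic: for example, training on a single GPU with ```SOLVER.IMS_PER_BATCH 2``` (the same 2 images per GPU as the default 16-image, 8-GPU setup) would correspond to ```SOLVER.BASE_LR 0.0000125``` (0.0001 * 2 / 16); treat this only as an initial guess and tune from there.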
55 | To train on 1 GPU, you need to figure out learning rate and batch size by yourself: 56 | ``` 57 | python plain_train_net.py \ 58 | --config-file configs/cityscapes/hgformer_swin_tiny_bs16_20k.yaml \ 59 | --num-gpus 1 SOLVER.IMS_PER_BATCH SET_TO_SOME_REASONABLE_VALUE SOLVER.BASE_LR SET_TO_SOME_REASONABLE_VALUE 60 | ``` 61 | -------------------------------------------------------------------------------- /configs/mapillary/hgformer_R50_bs16_20k_mapillary.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-mapillary19-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "GroupFormer" 4 | RESNETS: 5 | NORM: "GN" 6 | SEM_SEG_HEAD: 7 | NAME: "MaskFormerHead" 8 | IGNORE_VALUE: 255 9 | NUM_CLASSES: 19 10 | LOSS_WEIGHT: 1.0 11 | CONVS_DIM: 256 12 | MASK_DIM: 256 13 | NORM: "GN" 14 | # pixel decoder 15 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 16 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 17 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 18 | COMMON_STRIDE: 4 19 | TRANSFORMER_ENC_LAYERS: 6 20 | NUM_GROUP_TOKENS: [512, 32] 21 | NUM_OUTPUT_GROUPS: [512, 32] 22 | # DOWNSAMPLE_RATE: 16 # 0.31 23 | # DOWNSAMPLE_RATE: 8 # mapillary: (16, 22) 24 | DOWNSAMPLE_RATE: 4 # 0.32s mapillary: (32, 44), cityscapes: (32, 64) 25 | 26 | # SPIX_RES: [16, 16] 27 | MASK_FORMER: 28 | # TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 29 | TRANSFORMER_DECODER_NAME: "GroupFormerDecoder" 30 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 31 | DEEP_SUPERVISION: True 32 | # DEEP_MASK_SUPERVISION: False 33 | NO_OBJECT_WEIGHT: 0.1 34 | STAGE_WEIGHTS: [1.0] 35 | CLASS_WEIGHT: 2.0 36 | MASK_WEIGHT: 5.0 37 | DICE_WEIGHT: 5.0 38 | SPIX_MASK_WEIGHT: 0.0 39 | SPIX_CLASS_WEIGHT: 2.0 40 | CONTRASTIVE_LOSS: True 41 | CONTRASTIVE_WEIGH: 6.0 42 | CONTRASTIVE_TAU: 0.1 43 | HIDDEN_DIM: 256 44 | NUM_OBJECT_QUERIES: 100 45 | NHEADS: 8 46 | DROPOUT: 0.0 47 | DIM_FEEDFORWARD: 2048 48 | ENC_LAYERS: 0 49 | PRE_NORM: False 50 | ENFORCE_INPUT_PROJ: False 51 | SIZE_DIVISIBILITY: 32 52 | # SIZE_DIVISIBILITY: 64 53 | DEC_LAYERS: 6 # 9 decoder layers, add one for the loss on learnable query 54 | SPIX_SELF_ATTEN_LAYERS: 6 55 | TRAIN_NUM_POINTS: 12544 56 | OVERSAMPLE_RATIO: 3.0 57 | IMPORTANCE_SAMPLE_RATIO: 0.75 58 | TEST: 59 | SEMANTIC_ON: True 60 | INSTANCE_ON: False 61 | PANOPTIC_ON: False 62 | OVERLAP_THRESHOLD: 0.8 63 | OBJECT_MASK_THRESHOLD: 0.8 64 | SOLVER: 65 | IMS_PER_BATCH: 16 66 | BASE_LR: 0.0001 67 | MAX_ITER: 20000 68 | WARMUP_FACTOR: 1.0 69 | WARMUP_ITERS: 0 70 | WEIGHT_DECAY: 0.05 71 | OPTIMIZER: "ADAMW" 72 | LR_SCHEDULER_NAME: "WarmupPolyLR" 73 | BACKBONE_MULTIPLIER: 0.1 74 | CLIP_GRADIENTS: 75 | ENABLED: True 76 | CLIP_TYPE: "full_model" 77 | CLIP_VALUE: 0.01 78 | NORM_TYPE: 2.0 79 | AMP: 80 | ENABLED: False 81 | DATALOADER: 82 | FILTER_EMPTY_ANNOTATIONS: True 83 | NUM_WORKERS: 4 84 | VERSION: 2 85 | 86 | CUDNN_BENCHMARK: True 87 | 88 | TEST: 89 | CLUSTER_SOFTMAX: True 90 | PRED_STAGE: "spix_pixelexclude0125+stage3" -------------------------------------------------------------------------------- /datasets/split_data/gta/split_gta.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool 2 | import shutil 3 | import os 4 | shutil._USE_CP_SENDFILE = False 5 | def worker(path_pair): 6 | srcpath, dstpath = path_pair 7 | # shutil.copyfile(srcpath, dstpath) 8 | shutil.move(srcpath, dstpath) 9 | 10 | if __name__ == '__main__': 11 | pool = Pool(32) 12 | image_path = 
r'datasets/GTA/images' 13 | label_path = r'datasets/GTA/labels' 14 | 15 | with open('datasets/split_data/gtav_split_train.txt', 'r') as f: 16 | train_list = f.readlines() 17 | train_list = [x.strip() for x in train_list] 18 | 19 | with open('datasets/split_data/gtav_split_val.txt', 'r') as f: 20 | val_list = f.readlines() 21 | val_list = [x.strip() for x in val_list] 22 | 23 | with open('datasets/split_data/gtav_split_test.txt', 'r') as f: 24 | test_list = f.readlines() 25 | test_list = [x.strip() for x in test_list] 26 | 27 | train_pairs = [] 28 | 29 | if not os.path.exists(os.path.join(image_path, 'train')): 30 | os.makedirs(os.path.join(image_path, 'train')) 31 | 32 | if not os.path.exists(os.path.join(label_path, 'train')): 33 | os.makedirs(os.path.join(label_path, 'train')) 34 | 35 | for file in train_list: 36 | srcfile = os.path.join(image_path, file) 37 | dstfile = os.path.join(image_path, 'train', file) 38 | train_pairs.append((srcfile, dstfile)) 39 | 40 | srclabel = os.path.join(label_path, file) 41 | dstlabel = os.path.join(label_path, 'train', file) 42 | train_pairs.append((srclabel, dstlabel)) 43 | pool.map(worker, train_pairs) 44 | 45 | val_pairs = [] 46 | 47 | if not os.path.exists(os.path.join(image_path, 'valid')): 48 | os.makedirs(os.path.join(image_path, 'valid')) 49 | 50 | if not os.path.exists(os.path.join(label_path, 'valid')): 51 | os.makedirs(os.path.join(label_path, 'valid')) 52 | 53 | for file in val_list: 54 | srcfile = os.path.join(image_path, file) 55 | dstfile = os.path.join(image_path, 'valid', file) 56 | val_pairs.append((srcfile, dstfile)) 57 | 58 | srclabel = os.path.join(label_path, file) 59 | dstlabel = os.path.join(label_path, 'valid', file) 60 | val_pairs.append((srclabel, dstlabel)) 61 | pool.map(worker, val_pairs) 62 | 63 | test_pairs = [] 64 | 65 | if not os.path.exists(os.path.join(image_path, 'test')): 66 | os.makedirs(os.path.join(image_path, 'test')) 67 | 68 | if not os.path.exists(os.path.join(label_path, 'test')): 69 | os.makedirs(os.path.join(label_path, 'test')) 70 | 71 | for file in test_list: 72 | srcfile = os.path.join(image_path, file) 73 | dstfile = os.path.join(image_path, 'test', file) 74 | test_pairs.append((srcfile, dstfile)) 75 | 76 | srclabel = os.path.join(label_path, file) 77 | dstlabel = os.path.join(label_path, 'test', file) 78 | test_pairs.append((srclabel, dstlabel)) 79 | pool.map(worker, test_pairs) -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | This directory contains few tools for HGFormer. 2 | 3 | * `convert-torchvision-to-d2.py` 4 | 5 | Tool to convert torchvision pre-trained weights for D2. 6 | 7 | ``` 8 | wget https://download.pytorch.org/models/resnet101-63fe2227.pth 9 | python tools/convert-torchvision-to-d2.py resnet101-63fe2227.pth R-101.pkl 10 | ``` 11 | 12 | * `convert-pretrained-swin-model-to-d2.py` 13 | 14 | Tool to convert Swin Transformer pre-trained weights for D2. 
15 | 16 | ``` 17 | pip install timm 18 | 19 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 20 | python tools/convert-pretrained-swin-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 21 | 22 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth 23 | python tools/convert-pretrained-swin-model-to-d2.py swin_small_patch4_window7_224.pth swin_small_patch4_window7_224.pkl 24 | 25 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth 26 | python tools/convert-pretrained-swin-model-to-d2.py swin_base_patch4_window12_384_22k.pth swin_base_patch4_window12_384_22k.pkl 27 | 28 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth 29 | python tools/convert-pretrained-swin-model-to-d2.py swin_large_patch4_window12_384_22k.pth swin_large_patch4_window12_384_22k.pkl 30 | ``` 31 | 32 | * `evaluate_pq_for_semantic_segmentation.py` 33 | 34 | Tool to evaluate PQ (PQ-stuff) for semantic segmentation predictions. 35 | 36 | Usage: 37 | 38 | ``` 39 | python tools/evaluate_pq_for_semantic_segmentation.py --dataset-name ade20k_sem_seg_val --json-file OUTPUT_DIR/inference/sem_seg_predictions.json 40 | ``` 41 | 42 | where `OUTPUT_DIR` is set in the config file. 43 | 44 | * `evaluate_coco_boundary_ap.py` 45 | 46 | Tool to evaluate Boundary AP for instance segmentation predictions. 47 | 48 | Usage: 49 | 50 | ``` 51 | python tools/coco_instance_evaluation.py --gt-json-file COCO_GT_JSON --dt-json-file COCO_DT_JSON 52 | ``` 53 | 54 | To install Boundary IoU API, run: 55 | 56 | ``` 57 | pip install git+https://github.com/bowenc0221/boundary-iou-api.git 58 | ``` 59 | 60 | * `analyze_model.py` 61 | 62 | Tool to analyze model parameters and flops. 63 | 64 | Usage for semantic segmentation (ADE20K only, use with caution!): 65 | 66 | ``` 67 | python tools/analyze_model.py --num-inputs 1 --tasks flop --use-fixed-input-size --config-file CONFIG_FILE 68 | ``` 69 | 70 | Note that, for semantic segmentation (ADE20K only), we use a dummy image with fixed size that equals to `cfg.INPUT.CROP.SIZE[0] x cfg.INPUT.CROP.SIZE[0]`. 71 | Please do not use `--use-fixed-input-size` for calculating FLOPs on other datasets like Cityscapes! 72 | 73 | Usage for panoptic and instance segmentation: 74 | 75 | ``` 76 | python tools/analyze_model.py --num-inputs 100 --tasks flop --config-file CONFIG_FILE 77 | ``` 78 | 79 | Note that, for panoptic and instance segmentation, we compute the average flops over 100 real validation images. 80 | -------------------------------------------------------------------------------- /hgformer/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 
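    Illustrative usage (shapes only)::

        pe = PositionEmbeddingSine(num_pos_feats=128, normalize=True)
        x = torch.zeros(2, 256, 16, 32)   # [B, C, H, W] feature map; only B, H, W are used
        pos = pe(x)                       # [2, 256, 16, 32], i.e. 2 * num_pos_feats channels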
16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | if mask is None: 31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 32 | # mask: e.g. shape [2, 16, 32], [B, H, W] 33 | not_mask = ~mask 34 | y_embed = not_mask.cumsum(1, dtype=torch.float32) # [B, H, W] 35 | x_embed = not_mask.cumsum(2, dtype=torch.float32) # [B, H, W] 36 | if self.normalize: 37 | eps = 1e-6 38 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale # normalize the coordinates, then multiply 2pi 39 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 40 | 41 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) # [128] 42 | # import ipdb; ipdb.set_trace() 43 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 44 | 45 | pos_x = x_embed[:, :, :, None] / dim_t # [B, H, W, num_pos_feats] 46 | pos_y = y_embed[:, :, :, None] / dim_t 47 | pos_x = torch.stack( 48 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 49 | ).flatten(3) # [B, H, W, num_pos_feats] 50 | # import ipdb; ipdb.set_trace() 51 | pos_y = torch.stack( 52 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 53 | ).flatten(3) 54 | 55 | # import ipdb; ipdb.set_trace() 56 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) # [B, 2*num_pos_feats, H, W], 2 * num_pos_feats is equal to the number of feat channels 57 | return pos 58 | 59 | def __repr__(self, _repr_indent=4): 60 | head = "Positional encoding " + self.__class__.__name__ 61 | body = [ 62 | "num_pos_feats: {}".format(self.num_pos_feats), 63 | "temperature: {}".format(self.temperature), 64 | "normalize: {}".format(self.normalize), 65 | "scale: {}".format(self.scale), 66 | ] 67 | # _repr_indent = 4 68 | lines = [head] + [" " * _repr_indent + line for line in body] 69 | return "\n".join(lines) 70 | -------------------------------------------------------------------------------- /MODEL_ZOO.md: -------------------------------------------------------------------------------- 1 | # HGFormer Model Zoo and Baselines 2 | 3 | #### Detectron2 ImageNet Pretrained Models 4 | 5 | It's common to initialize from backbone models pre-trained on ImageNet classification tasks. 6 | 7 | To prepare the backbones pre-trained on ImageNet classification, please following [tools/README.md](./tools/README.md) 8 | 9 | #### License 10 | 11 | All models available for download through this document are licensed under the 12 | [Creative Commons Attribution-NonCommercial 4.0 International License](https://creativecommons.org/licenses/by-nc/4.0/). 
13 | 14 | ## Cityscapes -> ACDC 15 | | Method | Backbone | Fog | Night | Rain | Snow | All | Download | 16 | |:-----------:|:---------:|:-----:|:-----:|:-----:|:-----:|:-----:|:--------:| 17 | | Mask2former | Swin-Tiny | 54.06 | 38.11 | 59.54 | 55.76 | 53.65 | [model](https://drive.google.com/drive/folders/1eL38sFGdUNV8o9EbFsheurHjdm-CNg5K?usp=sharing) | 18 | | HGFormer | Swin-Tiny | 59.82 | 41.88 | 60.92 | 60.82 | 56.95 | [model](https://drive.google.com/drive/folders/1Rq1PnaYTFACpZX_-oXTq7laCa0zwbfFR?usp=drive_link) | 19 | 20 | 21 | ## Cityscapes -> Cityscapes-C (level 5) 22 | | Method | Backbone | Average | Motion | Defoc | Glass | Gauss | Gauss | Impul | Shot | Speck | Bright | Contr | Satur | JPEG | Snow | Spatt | Fog | Frost | Download | 23 | |:-----------:|:-----------:|:---------:|:------:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:------:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:--------:| 24 | | Mask2former | Swin-Tiny | 41.68 | 51.61 | 51.52 | 39.69 | 46.71 | 6.89 | 7.68 | 12.75 | 44.10 | 72.71 | 58.60 | 69.14 | 22.86 | 26.10 | 58.35 | 67.12 | 31.11 | [model](https://drive.google.com/drive/folders/1eL38sFGdUNV8o9EbFsheurHjdm-CNg5K?usp=sharing) | 25 | | HGFormer | Swin-Tiny | 43.81 | 52.51 | 53.03 | 39.02 | 47.93 | 16.45 | 16.03 | 20.55 | 48.44 | 74.51 | 57.14 | 70.53 | 27.32 | 25.66 | 59.19 | 66.49 | 26.11 | [model](https://drive.google.com/drive/folders/1Rq1PnaYTFACpZX_-oXTq7laCa0zwbfFR?usp=drive_link) | 26 | ## Cityscapes -> Others 27 | | Method | Backbone | Mapillary | BDD | GTA | Synthia | Average | Download | 28 | |:-----------:|:---------:|:---------:|:-----:|:-----:|:-------:|:--------:|:--------:| 29 | | Mask2former | Swin-Tiny | 65.28 | 49.87 | 51.38 | 34.76 | 50.32 | [model](https://drive.google.com/drive/folders/1eL38sFGdUNV8o9EbFsheurHjdm-CNg5K?usp=sharing) | 30 | | HGFormer | Swin-Tiny | 67.22 | 52.69 | 51.94 | 32.98 | 51.21 |[model](https://drive.google.com/drive/folders/1Rq1PnaYTFACpZX_-oXTq7laCa0zwbfFR?usp=drive_link) | 31 | ## Mapillary -> Others 32 | 33 | | Method | Backbone | GTA | Synthia | Cityscapes | BDD | Average | Download | 34 | |:-----------:|:---------:|:-----:|:-------:|:----------:|:-----:|:-------:|:--------:| 35 | | Mask2former | Swin-Tiny | 57.81 | 40.14 | 68.23 | 59.05 | 56.31 | [model](https://drive.google.com/drive/folders/1xqvAcQZs2NZhUD5dG2KGPmYBnlkH4u-s?usp=drive_link) | 36 | | HGFormer | Swin-Tiny | 60.79 | 39.15 | 69.28 | 62.22 | 57.86 | [model](https://drive.google.com/drive/folders/1XJgHBKT7J-_Gzqgzo3EiX0wAnjXMNCGG?usp=drive_link) | 37 | 38 | ## Disclaimer 39 | The numbers differ slightly from the results reported in the paper because we presented an average of three runs in the paper. -------------------------------------------------------------------------------- /hgformer/modeling/pixel_decoder/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | import os 13 | import glob 14 | 15 | import torch 16 | 17 | from torch.utils.cpp_extension import CUDA_HOME 18 | from torch.utils.cpp_extension import CppExtension 19 | from torch.utils.cpp_extension import CUDAExtension 20 | 21 | from setuptools import find_packages 22 | from setuptools import setup 23 | 24 | requirements = ["torch", "torchvision"] 25 | 26 | def get_extensions(): 27 | this_dir = os.path.dirname(os.path.abspath(__file__)) 28 | extensions_dir = os.path.join(this_dir, "src") 29 | 30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 33 | 34 | sources = main_file + source_cpu 35 | extension = CppExtension 36 | extra_compile_args = {"cxx": []} 37 | define_macros = [] 38 | 39 | # Force cuda since torch ask for a device, not if cuda is in fact available. 40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: 41 | extension = CUDAExtension 42 | sources += source_cuda 43 | define_macros += [("WITH_CUDA", None)] 44 | extra_compile_args["nvcc"] = [ 45 | "-DCUDA_HAS_FP16=1", 46 | "-D__CUDA_NO_HALF_OPERATORS__", 47 | "-D__CUDA_NO_HALF_CONVERSIONS__", 48 | "-D__CUDA_NO_HALF2_OPERATORS__", 49 | ] 50 | else: 51 | if CUDA_HOME is None: 52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') 53 | else: 54 | raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') 55 | 56 | sources = [os.path.join(extensions_dir, s) for s in sources] 57 | include_dirs = [extensions_dir] 58 | ext_modules = [ 59 | extension( 60 | "MultiScaleDeformableAttention", 61 | sources, 62 | include_dirs=include_dirs, 63 | define_macros=define_macros, 64 | extra_compile_args=extra_compile_args, 65 | ) 66 | ] 67 | return ext_modules 68 | 69 | setup( 70 | name="MultiScaleDeformableAttention", 71 | version="1.0", 72 | author="Weijie Su", 73 | url="https://github.com/fundamentalvision/Deformable-DETR", 74 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 75 | packages=find_packages(exclude=("configs", "tests",)), 76 | ext_modules=get_extensions(), 77 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 78 | ) 79 | -------------------------------------------------------------------------------- /datasets/prepare_mapillary_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 
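# Remaps the Mapillary Vistas label IDs onto the 19 Cityscapes train IDs (all
# remaining classes become the ignore label 255) and writes the converted masks
# to "$DETECTRON2_DATASETS/mapillary/labels_detectron2/{training,validation}".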
4 | import os 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import tqdm 9 | from PIL import Image 10 | from multiprocessing import Pool 11 | 12 | ignore_label = 255 13 | 14 | id_to_ignore_or_group = {} 15 | 16 | # def gen_id_to_ignore(): 17 | # global id_to_ignore_or_group 18 | for i in range(66): 19 | id_to_ignore_or_group[i] = ignore_label 20 | 21 | ### Convert each class to a corresponding cityscapes class 22 | ### Road 23 | # Road 24 | id_to_ignore_or_group[13] = 0 25 | # Lane Marking - General 26 | id_to_ignore_or_group[24] = 0 27 | # Manhole 28 | id_to_ignore_or_group[41] = 0 29 | 30 | ### Sidewalk 31 | # Curb 32 | id_to_ignore_or_group[2] = 1 33 | # Sidewalk 34 | id_to_ignore_or_group[15] = 1 35 | 36 | ### Building 37 | # Building 38 | id_to_ignore_or_group[17] = 2 39 | 40 | ### Wall 41 | # Wall 42 | id_to_ignore_or_group[6] = 3 43 | 44 | ### Fence 45 | # Fence 46 | id_to_ignore_or_group[3] = 4 47 | 48 | ### Pole 49 | # Pole 50 | id_to_ignore_or_group[45] = 5 51 | # Utility Pole 52 | id_to_ignore_or_group[47] = 5 53 | 54 | ### Traffic Light 55 | # Traffic Light 56 | id_to_ignore_or_group[48] = 6 57 | 58 | ### Traffic Sign 59 | # Traffic Sign 60 | id_to_ignore_or_group[50] = 7 61 | 62 | ### Vegetation 63 | # Vegitation 64 | id_to_ignore_or_group[30] = 8 65 | 66 | ### Terrain 67 | # Terrain 68 | id_to_ignore_or_group[29] = 9 69 | 70 | ### Sky 71 | # Sky 72 | id_to_ignore_or_group[27] = 10 73 | 74 | ### Person 75 | # Person 76 | id_to_ignore_or_group[19] = 11 77 | 78 | ### Rider 79 | # Bicyclist 80 | id_to_ignore_or_group[20] = 12 81 | # Motorcyclist 82 | id_to_ignore_or_group[21] = 12 83 | # Other Rider 84 | id_to_ignore_or_group[22] = 12 85 | 86 | ### Car 87 | # Car 88 | id_to_ignore_or_group[55] = 13 89 | 90 | ### Truck 91 | # Truck 92 | id_to_ignore_or_group[61] = 14 93 | 94 | ### Bus 95 | # Bus 96 | id_to_ignore_or_group[54] = 15 97 | 98 | ### Train 99 | # On Rails 100 | id_to_ignore_or_group[58] = 16 101 | 102 | ### Motorcycle 103 | # Motorcycle 104 | id_to_ignore_or_group[57] = 17 105 | 106 | ### Bicycle 107 | # Bicycle 108 | id_to_ignore_or_group[52] = 18 109 | 110 | 111 | 112 | def convert(filetuple): 113 | input, outputpath = filetuple 114 | lab = np.asarray(Image.open(input)) 115 | assert lab.dtype == np.uint8 116 | output = np.zeros_like(lab, dtype=np.uint8) + 255 117 | for obj_id in np.unique(lab): 118 | # print(f'obj_id{obj_id}') 119 | # print(f'{id_to_ignore_or_group}') 120 | if obj_id in id_to_ignore_or_group: 121 | output[lab == obj_id] = id_to_ignore_or_group[obj_id] 122 | 123 | Image.fromarray(output).save(outputpath) 124 | 125 | if __name__ == "__main__": 126 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "mapillary" 127 | pool = Pool(32) 128 | # gen_id_to_ignore() 129 | # import ipdb; ipdb.set_trace() 130 | for name in ["training", "validation"]: 131 | annotation_dir = dataset_dir / name / "labels" 132 | output_dir = dataset_dir / "labels_detectron2" / name 133 | output_dir.mkdir(parents=True, exist_ok=True) 134 | filelist = [] 135 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 136 | output_file = output_dir / file.name 137 | # convert(file, output_file) 138 | filelist.append((file, output_file)) 139 | pool.map(convert, filelist) -------------------------------------------------------------------------------- /datasets/prepare_synthia_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. 
and its affiliates. 4 | import os 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import tqdm 9 | from PIL import Image 10 | from multiprocessing import Pool 11 | import cv2 12 | import imageio 13 | import imageio.v2 as imageio 14 | ignore_label = 255 15 | 16 | # mapping based on README.txt from SYNTHIA_RAND_CITYSCAPES 17 | trainid_to_trainid = { 18 | 0: ignore_label, # void 19 | 1: 10, # sky 20 | 2: 2, # building 21 | 3: 0, # road 22 | 4: 1, # sidewalk 23 | 5: 4, # fence 24 | 6: 8, # vegetation 25 | 7: 5, # pole 26 | 8: 13, # car 27 | 9: 7, # traffic sign 28 | 10: 11, # pedestrian - person 29 | 11: 18, # bicycle 30 | 12: 17, # motorcycle 31 | 13: ignore_label, # parking-slot 32 | 14: ignore_label, # road-work 33 | 15: 6, # traffic light 34 | 16: 9, # terrain 35 | 17: 12, # rider 36 | 18: 14, # truck 37 | 19: 15, # bus 38 | 20: 16, # train 39 | 21: 3, # wall 40 | 22: ignore_label # Lanemarking 41 | } 42 | 43 | # def convert(filetupe): 44 | # input, outputpath = filetupe 45 | # # lab = np.asarray(Image.open(input)) 46 | # # lab = imageio.imread(input, format='PNG-FI') 47 | # lab = imageio.imread(input, format='PNG') 48 | # 49 | # # print(input) 50 | # # lab = cv2.imread(str(input), cv2.IMREAD_UNCHANGED)[:, :, -1] 51 | # lab = np.array(lab, dtype=np.uint8)[:, :, 0] 52 | # assert lab.dtype == np.uint8 53 | # output = np.zeros_like(lab, dtype=np.uint8) + 255 54 | # for obj_id in np.unique(lab): 55 | # if obj_id in trainid_to_trainid: 56 | # output[lab == obj_id] = trainid_to_trainid[obj_id] 57 | # 58 | # Image.fromarray(output).save(outputpath) 59 | 60 | 61 | def convert(filetupe): 62 | file, new_file = filetupe 63 | # re-assign labels to match the format of Cityscapes 64 | # PIL does not work with the image format, but cv2 does 65 | label = cv2.imread(str(file), cv2.IMREAD_UNCHANGED)[:, :, -1] 66 | 67 | label_copy = 255 * np.ones(label.shape, dtype=np.uint8) 68 | sample_class_stats = {} 69 | for k, v in trainid_to_trainid.items(): 70 | k_mask = label == k 71 | label_copy[k_mask] = v 72 | n = int(np.sum(k_mask)) 73 | if n > 0: 74 | sample_class_stats[v] = n 75 | # new_file = file.replace('.png', '_labelTrainIds.png') 76 | # assert file != new_file 77 | # sample_class_stats['file'] = new_file 78 | Image.fromarray(label_copy, mode='L').save(new_file) 79 | # return sample_class_stats 80 | 81 | if __name__ == "__main__": 82 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "synthia" 83 | pool = Pool(32) 84 | for name in ["train", "val"]: 85 | # for name in ["train"]: 86 | annotation_dir = dataset_dir / "GT" / "LABELS" / name 87 | output_dir = dataset_dir / "labels_detectron2" / name 88 | output_dir.mkdir(parents=True, exist_ok=True) 89 | filelist = [] 90 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 91 | output_file = output_dir / file.name 92 | # convert(file, output_file) 93 | filelist.append((file, output_file)) 94 | pool.map(convert, filelist) -------------------------------------------------------------------------------- /hgformer/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | try: 22 | import MultiScaleDeformableAttention as MSDA 23 | except ModuleNotFoundError as e: 24 | info_string = ( 25 | "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" 26 | "\t`cd mask2former/modeling/pixel_decoder/ops`\n" 27 | "\t`sh make.sh`\n" 28 | ) 29 | raise ModuleNotFoundError(info_string) 30 | 31 | 32 | class MSDeformAttnFunction(Function): 33 | @staticmethod 34 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 35 | ctx.im2col_step = im2col_step 36 | output = MSDA.ms_deform_attn_forward( 37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 38 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 39 | return output 40 | 41 | @staticmethod 42 | @once_differentiable 43 | def backward(ctx, grad_output): 44 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 45 | grad_value, grad_sampling_loc, grad_attn_weight = \ 46 | MSDA.ms_deform_attn_backward( 47 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 48 | 49 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 50 | 51 | 52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 53 | # for debug and test only, 54 | # need to use cuda version instead 55 | N_, S_, M_, D_ = value.shape 56 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 57 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 58 | sampling_grids = 2 * sampling_locations - 1 59 | sampling_value_list = [] 60 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 61 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 62 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 63 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 64 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 65 | # N_*M_, D_, Lq_, P_ 66 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 67 | mode='bilinear', padding_mode='zeros', align_corners=False) 68 | sampling_value_list.append(sampling_value_l_) 69 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 70 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 71 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 72 | return 
output.transpose(1, 2).contiguous() 73 | -------------------------------------------------------------------------------- /hgformer/test_time_augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | import logging 4 | from itertools import count 5 | 6 | import numpy as np 7 | import torch 8 | from fvcore.transforms import HFlipTransform 9 | from torch import nn 10 | from torch.nn.parallel import DistributedDataParallel 11 | 12 | from detectron2.data.detection_utils import read_image 13 | from detectron2.modeling import DatasetMapperTTA 14 | 15 | 16 | __all__ = [ 17 | "SemanticSegmentorWithTTA", 18 | ] 19 | 20 | 21 | class SemanticSegmentorWithTTA(nn.Module): 22 | """ 23 | A SemanticSegmentor with test-time augmentation enabled. 24 | Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`. 25 | """ 26 | 27 | def __init__(self, cfg, model, tta_mapper=None, batch_size=1): 28 | """ 29 | Args: 30 | cfg (CfgNode): 31 | model (SemanticSegmentor): a SemanticSegmentor to apply TTA on. 32 | tta_mapper (callable): takes a dataset dict and returns a list of 33 | augmented versions of the dataset dict. Defaults to 34 | `DatasetMapperTTA(cfg)`. 35 | batch_size (int): batch the augmented images into this batch size for inference. 36 | """ 37 | super().__init__() 38 | if isinstance(model, DistributedDataParallel): 39 | model = model.module 40 | self.cfg = cfg.clone() 41 | 42 | self.model = model 43 | 44 | if tta_mapper is None: 45 | tta_mapper = DatasetMapperTTA(cfg) 46 | self.tta_mapper = tta_mapper 47 | self.batch_size = batch_size 48 | 49 | def __call__(self, batched_inputs): 50 | """ 51 | Same input/output format as :meth:`SemanticSegmentor.forward` 52 | """ 53 | 54 | def _maybe_read_image(dataset_dict): 55 | ret = copy.copy(dataset_dict) 56 | if "image" not in ret: 57 | image = read_image(ret.pop("file_name"), self.model.input_format) 58 | image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW 59 | ret["image"] = image 60 | if "height" not in ret and "width" not in ret: 61 | ret["height"] = image.shape[1] 62 | ret["width"] = image.shape[2] 63 | return ret 64 | 65 | processed_results = [] 66 | for x in batched_inputs: 67 | result = self._inference_one_image(_maybe_read_image(x)) 68 | processed_results.append(result) 69 | return processed_results 70 | 71 | def _inference_one_image(self, input): 72 | """ 73 | Args: 74 | input (dict): one dataset dict with "image" field being a CHW tensor 75 | Returns: 76 | dict: one output dict 77 | """ 78 | orig_shape = (input["height"], input["width"]) 79 | augmented_inputs, tfms = self._get_augmented_inputs(input) 80 | 81 | final_predictions = None 82 | count_predictions = 0 83 | for input, tfm in zip(augmented_inputs, tfms): 84 | count_predictions += 1 85 | with torch.no_grad(): 86 | if final_predictions is None: 87 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 88 | final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2]) 89 | else: 90 | final_predictions = self.model([input])[0].pop("sem_seg") 91 | else: 92 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 93 | final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2]) 94 | else: 95 | final_predictions += self.model([input])[0].pop("sem_seg") 96 | 97 | final_predictions = final_predictions / count_predictions 98 | return {"sem_seg": final_predictions} 99 | 100 | def 
_get_augmented_inputs(self, input): 101 | augmented_inputs = self.tta_mapper(input) 102 | tfms = [x.pop("transforms") for x in augmented_inputs] 103 | return augmented_inputs, tfms 104 | -------------------------------------------------------------------------------- /hgformer/modeling/meta_arch/group_former_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import logging 3 | from copy import deepcopy 4 | from typing import Callable, Dict, List, Optional, Tuple, Union 5 | 6 | import fvcore.nn.weight_init as weight_init 7 | from torch import nn 8 | from torch.nn import functional as F 9 | 10 | from detectron2.config import configurable 11 | from detectron2.layers import Conv2d, ShapeSpec, get_norm 12 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY 13 | 14 | from ..transformer_decoder.maskformer_transformer_decoder import build_transformer_decoder 15 | from ..pixel_decoder.fpn import build_pixel_decoder 16 | 17 | 18 | @SEM_SEG_HEADS_REGISTRY.register() 19 | class GroupFormerHead(nn.Module): 20 | 21 | @configurable 22 | def __init__( 23 | self, 24 | input_shape: Dict[str, ShapeSpec], 25 | *, 26 | num_classes: int, 27 | pixel_decoder: nn.Module, 28 | loss_weight: float = 1.0, 29 | ignore_value: int = -1, 30 | # extra parameters 31 | transformer_predictor: nn.Module, 32 | transformer_in_feature: str, 33 | ): 34 | """ 35 | NOTE: this interface is experimental. 36 | Args: 37 | input_shape: shapes (channels and stride) of the input features 38 | num_classes: number of classes to predict 39 | pixel_decoder: the pixel decoder module 40 | loss_weight: loss weight 41 | ignore_value: category id to be ignored during training. 42 | transformer_predictor: the transformer decoder that makes prediction 43 | transformer_in_feature: input feature name to the transformer_predictor 44 | """ 45 | super().__init__() 46 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) 47 | self.in_features = [k for k, v in input_shape] 48 | feature_strides = [v.stride for k, v in input_shape] 49 | feature_channels = [v.channels for k, v in input_shape] 50 | 51 | self.ignore_value = ignore_value 52 | self.common_stride = 4 53 | self.loss_weight = loss_weight 54 | 55 | self.pixel_decoder = pixel_decoder 56 | self.predictor = transformer_predictor 57 | self.transformer_in_feature = transformer_in_feature 58 | 59 | self.num_classes = num_classes 60 | 61 | @classmethod 62 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): 63 | # figure out in_channels to transformer predictor 64 | if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder": 65 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 66 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding": 67 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 68 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder": # for maskformer2 69 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 70 | else: 71 | transformer_predictor_in_channels = input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels 72 | 73 | return { 74 | "input_shape": { 75 | k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES 76 | }, 77 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 78 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 79 | "pixel_decoder": build_pixel_decoder(cfg, input_shape), 80 | "loss_weight": 
cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, 81 | "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE, 82 | "transformer_predictor": build_transformer_decoder( 83 | cfg, 84 | transformer_predictor_in_channels, 85 | mask_classification=True, 86 | ), 87 | } 88 | 89 | def forward(self, features, mask=None): 90 | return self.layers(features, mask) 91 | 92 | def layers(self, features, mask=None): 93 | multi_scale_features = self.pixel_decoder.forward_features(features) 94 | if self.transformer_in_feature == "multi_scale_pixel_decoder": 95 | predictions = self.predictor(multi_scale_features, mask) 96 | else: 97 | raise NotImplementedError 98 | return predictions 99 | -------------------------------------------------------------------------------- /hgformer/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py 3 | """ 4 | Misc functions, including distributed helpers. 5 | 6 | Mostly copy-paste from torchvision references. 7 | """ 8 | from typing import List, Optional 9 | 10 | import torch 11 | import torch.distributed as dist 12 | import torchvision 13 | from torch import Tensor 14 | 15 | 16 | def _max_by_axis(the_list): 17 | # type: (List[List[int]]) -> List[int] 18 | maxes = the_list[0] 19 | for sublist in the_list[1:]: 20 | for index, item in enumerate(sublist): 21 | maxes[index] = max(maxes[index], item) 22 | return maxes 23 | 24 | 25 | class NestedTensor(object): 26 | def __init__(self, tensors, mask: Optional[Tensor]): 27 | self.tensors = tensors 28 | self.mask = mask 29 | 30 | def to(self, device): 31 | # type: (Device) -> NestedTensor # noqa 32 | cast_tensor = self.tensors.to(device) 33 | mask = self.mask 34 | if mask is not None: 35 | assert mask is not None 36 | cast_mask = mask.to(device) 37 | else: 38 | cast_mask = None 39 | return NestedTensor(cast_tensor, cast_mask) 40 | 41 | def decompose(self): 42 | return self.tensors, self.mask 43 | 44 | def __repr__(self): 45 | return str(self.tensors) 46 | 47 | 48 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 49 | # TODO make this more general 50 | if tensor_list[0].ndim == 3: 51 | if torchvision._is_tracing(): 52 | # nested_tensor_from_tensor_list() does not export well to ONNX 53 | # call _onnx_nested_tensor_from_tensor_list() instead 54 | return _onnx_nested_tensor_from_tensor_list(tensor_list) 55 | 56 | # TODO make it support different-sized images 57 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 58 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 59 | batch_shape = [len(tensor_list)] + max_size 60 | b, c, h, w = batch_shape 61 | dtype = tensor_list[0].dtype 62 | device = tensor_list[0].device 63 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 64 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 65 | for img, pad_img, m in zip(tensor_list, tensor, mask): 66 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 67 | m[: img.shape[1], : img.shape[2]] = False 68 | else: 69 | raise ValueError("not supported") 70 | return NestedTensor(tensor, mask) 71 | 72 | 73 | # _onnx_nested_tensor_from_tensor_list() is an implementation of 74 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
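# Worked example for nested_tensor_from_tensor_list() above: two CHW images of
# shapes [3, 20, 30] and [3, 24, 16] are zero-padded into a single tensor of
# shape [2, 3, 24, 30], together with a [2, 24, 30] boolean mask that is True
# exactly on the padded pixels.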
75 | @torch.jit.unused 76 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: 77 | max_size = [] 78 | for i in range(tensor_list[0].dim()): 79 | max_size_i = torch.max( 80 | torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) 81 | ).to(torch.int64) 82 | max_size.append(max_size_i) 83 | max_size = tuple(max_size) 84 | 85 | # work around for 86 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 87 | # m[: img.shape[1], :img.shape[2]] = False 88 | # which is not yet supported in onnx 89 | padded_imgs = [] 90 | padded_masks = [] 91 | for img in tensor_list: 92 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 93 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 94 | padded_imgs.append(padded_img) 95 | 96 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) 97 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) 98 | padded_masks.append(padded_mask.to(torch.bool)) 99 | 100 | tensor = torch.stack(padded_imgs) 101 | mask = torch.stack(padded_masks) 102 | 103 | return NestedTensor(tensor, mask=mask) 104 | 105 | 106 | def is_dist_avail_and_initialized(): 107 | if not dist.is_available(): 108 | return False 109 | if not dist.is_initialized(): 110 | return False 111 | return True 112 | -------------------------------------------------------------------------------- /hgformer/modeling/pixel_decoder/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import time 17 | import torch 18 | import torch.nn as nn 19 | from torch.autograd import gradcheck 20 | 21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | N, M, D = 1, 2, 2 25 | Lq, L, P = 2, 2, 2 26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 28 | S = sum([(H*W).item() for H, W in shapes]) 29 | 30 | 31 | torch.manual_seed(3) 32 | 33 | 34 | @torch.no_grad() 35 | def check_forward_equal_with_pytorch_double(): 36 | value = torch.rand(N, S, M, D).cuda() * 0.01 37 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 38 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 39 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 40 | im2col_step = 2 41 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 42 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 43 | fwdok = torch.allclose(output_cuda, output_pytorch) 44 | max_abs_err = (output_cuda - output_pytorch).abs().max() 45 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 46 | 47 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 48 | 49 | 50 | @torch.no_grad() 51 | def check_forward_equal_with_pytorch_float(): 52 | value = torch.rand(N, S, M, D).cuda() * 0.01 53 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 54 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 55 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 56 | im2col_step = 2 57 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 58 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 59 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 60 | max_abs_err = (output_cuda - output_pytorch).abs().max() 61 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 62 | 63 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 64 | 65 | 66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 67 | 68 | value = torch.rand(N, S, M, channels).cuda() * 0.01 69 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 70 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 71 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 72 | im2col_step = 2 73 | func = MSDeformAttnFunction.apply 74 | 75 | value.requires_grad = grad_value 76 | sampling_locations.requires_grad = grad_sampling_loc 77 | attention_weights.requires_grad = grad_attn_weight 78 | 79 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 80 | 81 | print(f'* 
{gradok} check_gradient_numerical(D={channels})') 82 | 83 | 84 | if __name__ == '__main__': 85 | check_forward_equal_with_pytorch_double() 86 | check_forward_equal_with_pytorch_float() 87 | 88 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 89 | check_gradient_numerical(channels, True, True, True) 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /tools/visualize_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | import argparse 4 | import os 5 | from itertools import chain 6 | import cv2 7 | import tqdm 8 | 9 | from detectron2.config import get_cfg 10 | from detectron2.data import DatasetCatalog, MetadataCatalog, build_detection_train_loader 11 | from detectron2.data import detection_utils as utils 12 | from detectron2.data.build import filter_images_with_few_keypoints 13 | from detectron2.utils.logger import setup_logger 14 | from detectron2.utils.visualizer import Visualizer 15 | from detectron2.projects.deeplab import add_deeplab_config, build_lr_scheduler 16 | 17 | # MaskFormer 18 | from hgformer import ( 19 | COCOInstanceNewBaselineDatasetMapper, 20 | COCOPanopticNewBaselineDatasetMapper, 21 | InstanceSegEvaluator, 22 | MaskFormerInstanceDatasetMapper, 23 | MaskFormerPanopticDatasetMapper, 24 | MaskFormerSemanticDatasetMapper, 25 | SemanticSegmentorWithTTA, 26 | add_maskformer2_config, 27 | ) 28 | 29 | def setup(args): 30 | cfg = get_cfg() 31 | add_deeplab_config(cfg) 32 | add_maskformer2_config(cfg) 33 | if args.config_file: 34 | cfg.merge_from_file(args.config_file) 35 | cfg.merge_from_list(args.opts) 36 | cfg.DATALOADER.NUM_WORKERS = 0 37 | cfg.freeze() 38 | return cfg 39 | 40 | 41 | def parse_args(in_args=None): 42 | parser = argparse.ArgumentParser(description="Visualize ground-truth data") 43 | parser.add_argument( 44 | "--source", 45 | choices=["annotation", "dataloader"], 46 | required=True, 47 | help="visualize the annotations or the data loader (with pre-processing)", 48 | ) 49 | parser.add_argument("--config-file", metavar="FILE", help="path to config file") 50 | parser.add_argument("--output-dir", default="./", help="path to output directory") 51 | parser.add_argument("--show", action="store_true", help="show output in a window") 52 | parser.add_argument( 53 | "opts", 54 | help="Modify config options using the command-line", 55 | default=None, 56 | nargs=argparse.REMAINDER, 57 | ) 58 | return parser.parse_args(in_args) 59 | 60 | 61 | if __name__ == "__main__": 62 | args = parse_args() 63 | logger = setup_logger() 64 | logger.info("Arguments: " + str(args)) 65 | cfg = setup(args) 66 | 67 | dirname = args.output_dir 68 | os.makedirs(dirname, exist_ok=True) 69 | metadata = MetadataCatalog.get(cfg.DATASETS.TRAIN[0]) 70 | 71 | def output(vis, fname): 72 | if args.show: 73 | print(fname) 74 | cv2.imshow("window", vis.get_image()[:, :, ::-1]) 75 | cv2.waitKey() 76 | else: 77 | filepath = os.path.join(dirname, fname) 78 | print("Saving to {} ...".format(filepath)) 79 | vis.save(filepath) 80 | 81 | scale = 1.0 82 | if args.source == "dataloader": 83 | mapper = MaskFormerSemanticDatasetMapper(cfg, True) 84 | train_data_loader = build_detection_train_loader(cfg, mapper=mapper) 85 | for batch in train_data_loader: 86 | for per_image in batch: 87 | # Pytorch tensor is in (C, H, W) format 88 | img = per_image["image"].permute(1, 2, 0).cpu().detach().numpy() 89 | img = utils.convert_image_to_rgb(img, 
cfg.INPUT.FORMAT) 90 | 91 | visualizer = Visualizer(img, metadata=metadata, scale=scale) 92 | # import ipdb; ipdb.set_trace() 93 | target_fields = per_image["instances"].get_fields() 94 | # import ipdb; ipdb.set_trace() 95 | labels = [metadata.stuff_classes[i] for i in target_fields["gt_classes"]] 96 | 97 | 98 | 99 | vis = visualizer.output 100 | # output(vis, str(per_image["image_id"]) + ".jpg") 101 | output(vis, os.path.basename(per_image['file_name'])) 102 | 103 | 104 | 105 | # vis = visualizer.overlay_instances( 106 | # labels=labels, 107 | # # boxes=target_fields.get("gt_boxes", None), 108 | # masks=target_fields.get("gt_masks", None), 109 | # # keypoints=target_fields.get("gt_keypoints", None), 110 | # ) 111 | # # output(vis, str(per_image["image_id"]) + ".jpg") 112 | # output(vis, os.path.basename(per_image['file_name'])) 113 | else: 114 | dicts = list(chain.from_iterable([DatasetCatalog.get(k) for k in cfg.DATASETS.TRAIN])) 115 | if cfg.MODEL.KEYPOINT_ON: 116 | dicts = filter_images_with_few_keypoints(dicts, 1) 117 | for dic in tqdm.tqdm(dicts): 118 | img = utils.read_image(dic["file_name"], "RGB") 119 | visualizer = Visualizer(img, metadata=metadata, scale=scale) 120 | vis = visualizer.draw_dataset_dict(dic) 121 | output(vis, os.path.basename(dic["file_name"])) 122 | -------------------------------------------------------------------------------- /hgformer/evaluation/instance_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import contextlib 3 | import copy 4 | import io 5 | import itertools 6 | import json 7 | import logging 8 | import numpy as np 9 | import os 10 | import pickle 11 | from collections import OrderedDict 12 | import pycocotools.mask as mask_util 13 | import torch 14 | from pycocotools.coco import COCO 15 | from pycocotools.cocoeval import COCOeval 16 | from tabulate import tabulate 17 | 18 | import detectron2.utils.comm as comm 19 | from detectron2.config import CfgNode 20 | from detectron2.data import MetadataCatalog 21 | from detectron2.data.datasets.coco import convert_to_coco_json 22 | from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco 23 | from detectron2.evaluation.fast_eval_api import COCOeval_opt 24 | from detectron2.structures import Boxes, BoxMode, pairwise_iou 25 | from detectron2.utils.file_io import PathManager 26 | from detectron2.utils.logger import create_small_table 27 | 28 | 29 | # modified from COCOEvaluator for instance segmetnat 30 | class InstanceSegEvaluator(COCOEvaluator): 31 | """ 32 | Evaluate AR for object proposals, AP for instance detection/segmentation, AP 33 | for keypoint detection outputs using COCO's metrics. 34 | See http://cocodataset.org/#detection-eval and 35 | http://cocodataset.org/#keypoints-eval to understand its metrics. 36 | The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means 37 | the metric cannot be computed (e.g. due to no predictions made). 38 | 39 | In addition to COCO, this evaluator is able to support any bounding box detection, 40 | instance segmentation, or keypoint detection dataset. 41 | """ 42 | 43 | def _eval_predictions(self, predictions, img_ids=None): 44 | """ 45 | Evaluate predictions. Fill self._results with the metrics of the tasks. 
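        Compared to the base COCOEvaluator, the strict check that predicted category ids form a
        contiguous [0, num_classes) range is relaxed here: a prediction only needs a category id
        that appears in the dataset's ``thing_dataset_id_to_contiguous_id`` mapping.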
46 | """ 47 | self._logger.info("Preparing results for COCO format ...") 48 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) 49 | tasks = self._tasks or self._tasks_from_predictions(coco_results) 50 | 51 | # unmap the category ids for COCO 52 | if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): 53 | dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id 54 | # all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) 55 | # num_classes = len(all_contiguous_ids) 56 | # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 57 | 58 | reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} 59 | for result in coco_results: 60 | category_id = result["category_id"] 61 | # assert category_id < num_classes, ( 62 | # f"A prediction has class={category_id}, " 63 | # f"but the dataset only has {num_classes} classes and " 64 | # f"predicted class id should be in [0, {num_classes - 1}]." 65 | # ) 66 | assert category_id in reverse_id_mapping, ( 67 | f"A prediction has class={category_id}, " 68 | f"but the dataset only has class ids in {dataset_id_to_contiguous_id}." 69 | ) 70 | result["category_id"] = reverse_id_mapping[category_id] 71 | 72 | if self._output_dir: 73 | file_path = os.path.join(self._output_dir, "coco_instances_results.json") 74 | self._logger.info("Saving results to {}".format(file_path)) 75 | with PathManager.open(file_path, "w") as f: 76 | f.write(json.dumps(coco_results)) 77 | f.flush() 78 | 79 | if not self._do_evaluation: 80 | self._logger.info("Annotations are not available for evaluation.") 81 | return 82 | 83 | self._logger.info( 84 | "Evaluating predictions with {} COCO API...".format( 85 | "unofficial" if self._use_fast_impl else "official" 86 | ) 87 | ) 88 | for task in sorted(tasks): 89 | assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!" 90 | coco_eval = ( 91 | _evaluate_predictions_on_coco( 92 | self._coco_api, 93 | coco_results, 94 | task, 95 | kpt_oks_sigmas=self._kpt_oks_sigmas, 96 | use_fast_impl=self._use_fast_impl, 97 | img_ids=img_ids, 98 | max_dets_per_image=self._max_dets_per_image, 99 | ) 100 | if len(coco_results) > 0 101 | else None # cocoapi does not handle empty results very well 102 | ) 103 | 104 | res = self._derive_coco_results( 105 | coco_eval, task, class_names=self._metadata.get("thing_classes") 106 | ) 107 | self._results[task] = res 108 | -------------------------------------------------------------------------------- /demo/inference.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py 3 | import argparse 4 | import glob 5 | import multiprocessing as mp 6 | import os 7 | 8 | # fmt: off 9 | import sys 10 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 11 | # fmt: on 12 | 13 | import tempfile 14 | import time 15 | import warnings 16 | 17 | import cv2 18 | import numpy as np 19 | import tqdm 20 | 21 | from detectron2.config import get_cfg 22 | from detectron2.data.detection_utils import read_image 23 | from detectron2.projects.deeplab import add_deeplab_config 24 | from detectron2.utils.logger import setup_logger 25 | 26 | from hgformer import add_maskformer2_config 27 | from predictor import VisualizationDemo 28 | 29 | 30 | # constants 31 | WINDOW_NAME = "mask2former demo" 32 | 33 | def GetFileFromThisRootDir(dir,ext = None): 34 | allfiles = [] 35 | needExtFilter = (ext != None) 36 | for root,dirs,files in os.walk(dir): 37 | for filespath in files: 38 | filepath = os.path.join(root, filespath) 39 | extension = os.path.splitext(filepath)[1][1:] 40 | if needExtFilter and extension in ext: 41 | allfiles.append(filepath) 42 | elif not needExtFilter: 43 | allfiles.append(filepath) 44 | return allfiles 45 | 46 | def setup_cfg(args): 47 | # load config from file and command-line arguments 48 | cfg = get_cfg() 49 | add_deeplab_config(cfg) 50 | add_maskformer2_config(cfg) 51 | cfg.merge_from_file(args.config_file) 52 | cfg.merge_from_list(args.opts) 53 | cfg.freeze() 54 | return cfg 55 | 56 | 57 | def get_parser(): 58 | parser = argparse.ArgumentParser(description="maskformer2 demo for builtin configs") 59 | parser.add_argument( 60 | "--config-file", 61 | default="configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml", 62 | metavar="FILE", 63 | help="path to config file", 64 | ) 65 | parser.add_argument("--webcam", action="store_true", help="Take inputs from webcam.") 66 | parser.add_argument("--video-input", help="Path to video file.") 67 | parser.add_argument( 68 | "--input", 69 | nargs="+", 70 | help="A list of space separated input images; " 71 | "or a single glob pattern such as 'directory/*.jpg'", 72 | ) 73 | parser.add_argument( 74 | "--output", 75 | help="A file or directory to save output visualizations. 
" 76 | "If not given, will show output in an OpenCV window.", 77 | ) 78 | 79 | parser.add_argument( 80 | "--confidence-threshold", 81 | type=float, 82 | default=0.5, 83 | help="Minimum score for instance predictions to be shown", 84 | ) 85 | parser.add_argument( 86 | "--opts", 87 | help="Modify config options using the command-line 'KEY VALUE' pairs", 88 | default=[], 89 | nargs=argparse.REMAINDER, 90 | ) 91 | return parser 92 | 93 | 94 | def test_opencv_video_format(codec, file_ext): 95 | with tempfile.TemporaryDirectory(prefix="video_format_test") as dir: 96 | filename = os.path.join(dir, "test_file" + file_ext) 97 | writer = cv2.VideoWriter( 98 | filename=filename, 99 | fourcc=cv2.VideoWriter_fourcc(*codec), 100 | fps=float(30), 101 | frameSize=(10, 10), 102 | isColor=True, 103 | ) 104 | [writer.write(np.zeros((10, 10, 3), np.uint8)) for _ in range(30)] 105 | writer.release() 106 | if os.path.isfile(filename): 107 | return True 108 | return False 109 | 110 | 111 | if __name__ == "__main__": 112 | mp.set_start_method("spawn", force=True) 113 | args = get_parser().parse_args() 114 | setup_logger(name="fvcore") 115 | logger = setup_logger() 116 | logger.info("Arguments: " + str(args)) 117 | 118 | cfg = setup_cfg(args) 119 | 120 | demo = VisualizationDemo(cfg) 121 | 122 | # import ipdb; ipdb.set_trace() 123 | filelist = GetFileFromThisRootDir(args.input[0]) 124 | for path in tqdm.tqdm(filelist, disable=not args.output): 125 | # use PIL, to be consistent with evaluation 126 | img = read_image(path, format="BGR") 127 | start_time = time.time() 128 | # predictions, visualized_output = demo.run_on_image(img) 129 | predictions = demo.predictor(img) 130 | 131 | # import ipdb; ipdb.set_trace() 132 | logger.info( 133 | "{}: {} in {:.2f}s".format( 134 | path, 135 | "detected {} instances".format(len(predictions["instances"])) 136 | if "instances" in predictions 137 | else "finished", 138 | time.time() - start_time, 139 | ) 140 | ) 141 | 142 | basename = os.path.basename(path) 143 | if not os.path.exists(args.output): 144 | os.makedirs(args.output) 145 | output_path = os.path.join(args.output, basename) 146 | 147 | outimg = predictions['sem_seg'].detach().cpu().numpy().argmax(0).astype(np.uint8) 148 | cv2.imwrite(output_path, outimg) 149 | -------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | # Prepare Datasets for HGFormer 2 | 3 | A dataset can be used by accessing [DatasetCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.DatasetCatalog) 4 | for its data, or [MetadataCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.MetadataCatalog) for its metadata (class names, etc). 5 | This document explains how to setup the builtin datasets so they can be used by the above APIs. 6 | [Use Custom Datasets](https://detectron2.readthedocs.io/tutorials/datasets.html) gives a deeper dive on how to use `DatasetCatalog` and `MetadataCatalog`, 7 | and how to add new datasets to them. 8 | 9 | HGFormer has builtin support for a few datasets. 10 | The datasets are assumed to exist in a directory specified by the environment variable 11 | `DETECTRON2_DATASETS`. 12 | Under this directory, detectron2 will look for datasets in the structure described below, if needed. 
13 | ``` 14 | $DETECTRON2_DATASETS/ 15 | cityscapes/ 16 | cityscapes-c/ 17 | mapillary/ 18 | acdc/ 19 | bdd/ 20 | gta/ 21 | synthia/ 22 | ``` 23 | 24 | You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`. 25 | If left unset, the default is `./datasets` relative to your current working directory. 26 | 27 | 28 | ## Expected dataset structure for [cityscapes](https://www.cityscapes-dataset.com/downloads/): 29 | ``` 30 | cityscapes/ 31 | gtFine/ 32 | train/ 33 | aachen/ 34 | color.png, instanceIds.png, labelIds.png, polygons.json, 35 | labelTrainIds.png 36 | ... 37 | val/ 38 | test/ 39 | leftImg8bit/ 40 | train/ 41 | val/ 42 | test/ 43 | ``` 44 | 45 | Install cityscapes scripts by: 46 | ``` 47 | pip install git+https://github.com/mcordts/cityscapesScripts.git 48 | ``` 49 | 50 | Note: to create labelTrainIds.png, first prepare the above structure, then run the cityscapes script with: 51 | ``` 52 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesscripts/preparation/createTrainIdLabelImgs.py 53 | ``` 54 | 55 | ## Expected dataset structure for [ACDC](https://acdc.vision.ee.ethz.ch/download): 56 | ``` 57 | acdc/ 58 | rgb_anon/ 59 | fog/ 60 | test/ 61 | night/ 62 | test/ 63 | rain/ 64 | test/ 65 | snow/ 66 | test/ 67 | all/ 68 | test/ 69 | 70 | ``` 71 | You should create the ```all``` folder and copy the test images of all conditions into ```all/test```. 72 | 73 | ## Expected dataset structure for [Mapillary](https://www.mapillary.com/dataset/vistas): 74 | ``` 75 | mapillary/ 76 | training/ 77 | images/ 78 | labels/ 79 | validation/ 80 | images/ 81 | labels/ 82 | testing/ 83 | images/ 84 | labels/ 85 | labels_detectron2/ 86 | training/ 87 | validation/ 88 | ``` 89 | Run `python datasets/prepare_mapillary_sem_seg.py` to map the Mapillary labels to the Cityscapes labels. 90 | 91 | 92 | ## Expected dataset structure for [BDD](https://bdd-data.berkeley.edu/): 93 | ``` 94 | bdd/ 95 | images/ 96 | 10k/ 97 | train/ 98 | val/ 99 | labels/ 100 | sem_seg/ 101 | masks/ 102 | train/ 103 | val/ 104 | ``` 105 | 106 | 107 | ## Expected dataset structure for [Cityscapes-c](): 108 | 109 | ``` 110 | cityscapes-c/ 111 | clean/ 112 | brightness/ 113 | 1/ 114 | 2/ 115 | 3/ 116 | 4/ 117 | 5/ 118 | ... 119 | ``` 120 | 121 | The ```clean``` folder should contain the Cityscapes val-set images. 122 | 123 | The corruption-type folders (e.g. brightness) are generated by running `python datasets/generate_cityscapes_c.py`. 124 | 125 | 126 | ## Expected dataset structure for [GTAV](https://download.visinf.tu-darmstadt.de/data/from_games/): 127 | ``` 128 | gta/ 129 | images/ 130 | train/ 131 | valid/ 132 | test/ 133 | labels/ 134 | train/ 135 | valid/ 136 | test/ 137 | labels_detectron2/ 138 | train/ 139 | valid/ 140 | test/ 141 | ``` 142 | Download the GTA dataset from https://download.visinf.tu-darmstadt.de/data/from_games/ 143 | 144 | Then unzip the images and labels. 145 | 146 | We split the dataset following [RobustNet](https://github.com/shachoi/RobustNet): 147 | ``` 148 | python datasets/split_data/gta/split_gta.py 149 | ``` 150 | For the GTA dataset, a small set of label maps (60 frames) has a different resolution from the corresponding images. 151 | Therefore, we need to resize these label maps.
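For reference, the resize step only needs to bring each mismatched label map to the resolution of its image with nearest-neighbor interpolation, so that the label ids are preserved. The snippet below is a minimal sketch of that step, assuming the `datasets/GTA` layout above and that images and label maps share file names; the script shipped with this repository is the supported way to do it.
```
import os
from PIL import Image

img_dir = "datasets/GTA/images/valid"
lbl_dir = "datasets/GTA/labels/valid"
out_dir = "datasets/GTA/labels/valid_resize"
os.makedirs(out_dir, exist_ok=True)

for name in os.listdir(lbl_dir):
    image = Image.open(os.path.join(img_dir, name))
    label = Image.open(os.path.join(lbl_dir, name))
    if label.size != image.size:
        # nearest-neighbor keeps the label ids intact
        label.resize(image.size, Image.NEAREST).save(os.path.join(out_dir, name))
```
In practice, run the provided script and move the results into place: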
152 | ``` 153 | python datasets/split_data/gta/resize_img.py 154 | mv datasets/GTA/labels/valid_resize/* datasets/GTA/labels/valid/ 155 | rm -rf datasets/GTA/labels/valid_resize/ 156 | ``` 157 | Finally, we map the labels for detectron2: 158 | ``` 159 | python datasets/prepare_gta_sem_seg.py 160 | ``` 161 | 162 | ## Expected dataset structure for [Synthia](https://synthia-dataset.net/downloads/): 163 | ``` 164 | synthia/ 165 | Depth/ 166 | Depth 167 | GT/ 168 | COLOR/ 169 | LABELS/ 170 | train/ 171 | val/ 172 | RGB/ 173 | train/ 174 | val/ 175 | ``` 176 | We follow [RobustNet](https://github.com/shachoi/RobustNet) to split the dataset: 177 | ``` 178 | python datasets/split_data/synthia/split_synthia.py 179 | ``` 180 | We then map the labels from Synthia to Cityscapes: 181 | ``` 182 | python datasets/prepare_synthia_sem_seg.py 183 | ``` 184 | 185 | -------------------------------------------------------------------------------- /hgformer/data/samplers/balanced_sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import itertools 3 | import logging 4 | import math 5 | from collections import defaultdict 6 | from typing import Optional 7 | import torch 8 | from torch.utils.data.sampler import Sampler 9 | from detectron2.utils import comm 10 | 11 | class BalancedTrainingSampler(Sampler): 12 | """ 13 | This is modified from detectron2's RepeatFactorTrainingSampler. 14 | Similar to TrainingSampler, but a sample may appear more times than others based 15 | on its "repeat factor". 16 | """ 17 | 18 | def __init__(self, repeat_factors, *, shuffle=True, seed=None): 19 | """ 20 | Args: 21 | repeat_factors (Tensor): a float vector, the repeat factor for each index. When it's 22 | full of ones, it is equivalent to ``TrainingSampler(len(repeat_factors), ...)``. 23 | shuffle (bool): whether to shuffle the indices or not 24 | seed (int): the initial seed of the shuffle. Must be the same 25 | across all workers. If None, will use a random seed shared 26 | among workers (require synchronization among all workers). 27 | """ 28 | self._shuffle = shuffle 29 | if seed is None: 30 | seed = comm.shared_random_seed() 31 | self._seed = int(seed) 32 | 33 | self._rank = comm.get_rank() 34 | self._world_size = comm.get_world_size() 35 | 36 | # Split into whole number (_int_part) and fractional (_frac_part) parts. 37 | self._int_part = torch.trunc(repeat_factors) 38 | self._frac_part = repeat_factors - self._int_part 39 | 40 | @staticmethod 41 | def repeat_factors_from_category_frequency(dataset_dicts, repeat_thresh): 42 | """ 43 | Compute (fractional) per-image repeat factors based on category frequency. 44 | The repeat factor for an image is a function of the frequency of the rarest 45 | category labeled in that image. The "frequency of category c" in [0, 1] is defined 46 | as the fraction of images in the training set (without repeats) in which category c 47 | appears. 48 | See :paper:`lvis` (>= v2) Appendix B.2. 49 | 50 | Args: 51 | dataset_dicts (list[dict]): annotations in Detectron2 dataset format. 52 | repeat_thresh (float): frequency threshold below which data is repeated. 53 | If the frequency is half of `repeat_thresh`, the image will be 54 | repeated twice. 55 | 56 | Returns: 57 | torch.Tensor: 58 | the i-th element is the repeat factor for the dataset image at index i. 59 | """ 60 | # 1.
For each category c, compute the fraction of images that contain it: f(c) 61 | category_freq = defaultdict(int) 62 | # import ipdb; ipdb.set_trace() 63 | for dataset_dict in dataset_dicts: # For each image (without repeats) 64 | cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]} 65 | for cat_id in cat_ids: 66 | category_freq[cat_id] += 1 67 | num_images = len(dataset_dicts) 68 | for k, v in category_freq.items(): 69 | category_freq[k] = v / num_images 70 | 71 | # 2. For each category c, compute the category-level repeat factor: 72 | # r(c) = max(1, sqrt(t / f(c))) 73 | category_rep = { 74 | cat_id: max(1.0, math.sqrt(repeat_thresh / cat_freq)) 75 | for cat_id, cat_freq in category_freq.items() 76 | } 77 | 78 | # 3. For each image I, compute the image-level repeat factor: 79 | # r(I) = max_{c in I} r(c) 80 | rep_factors = [] 81 | for dataset_dict in dataset_dicts: 82 | cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]} 83 | rep_factor = max({category_rep[cat_id] for cat_id in cat_ids}, default=1.0) 84 | rep_factors.append(rep_factor) 85 | 86 | return torch.tensor(rep_factors, dtype=torch.float32) 87 | 88 | def _get_epoch_indices(self, generator): 89 | """ 90 | Create a list of dataset indices (with repeats) to use for one epoch. 91 | 92 | Args: 93 | generator (torch.Generator): pseudo random number generator used for 94 | stochastic rounding. 95 | 96 | Returns: 97 | torch.Tensor: list of dataset indices to use in one epoch. Each index 98 | is repeated based on its calculated repeat factor. 99 | """ 100 | # Since repeat factors are fractional, we use stochastic rounding so 101 | # that the target repeat factor is achieved in expectation over the 102 | # course of training 103 | rands = torch.rand(len(self._frac_part), generator=generator) 104 | rep_factors = self._int_part + (rands < self._frac_part).float() 105 | # Construct a list of indices in which we repeat images as specified 106 | indices = [] 107 | for dataset_index, rep_factor in enumerate(rep_factors): 108 | indices.extend([dataset_index] * int(rep_factor.item())) 109 | return torch.tensor(indices, dtype=torch.int64) 110 | 111 | def __iter__(self): 112 | start = self._rank 113 | yield from itertools.islice(self._infinite_indices(), start, None, self._world_size) 114 | 115 | def _infinite_indices(self): 116 | g = torch.Generator() 117 | g.manual_seed(self._seed) 118 | while True: 119 | # Sample indices with repeats determined by stochastic rounding; each 120 | # "epoch" may have a slightly different size due to the rounding. 121 | indices = self._get_epoch_indices(g) 122 | if self._shuffle: 123 | randperm = torch.randperm(len(indices), generator=g) 124 | yield from indices[randperm].tolist() 125 | else: 126 | yield from indices.tolist() 127 | -------------------------------------------------------------------------------- /hgformer/modeling/meta_arch/mask_former_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import logging 3 | from copy import deepcopy 4 | from typing import Callable, Dict, List, Optional, Tuple, Union 5 | 6 | import fvcore.nn.weight_init as weight_init 7 | from torch import nn 8 | from torch.nn import functional as F 9 | 10 | from detectron2.config import configurable 11 | from detectron2.layers import Conv2d, ShapeSpec, get_norm 12 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY 13 | 14 | from ..transformer_decoder.maskformer_transformer_decoder import build_transformer_decoder 15 | from ..pixel_decoder.fpn import build_pixel_decoder 16 | 17 | 18 | @SEM_SEG_HEADS_REGISTRY.register() 19 | class MaskFormerHead(nn.Module): 20 | 21 | _version = 2 22 | 23 | # def _load_from_state_dict( 24 | # self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs 25 | # ): 26 | # version = local_metadata.get("version", None) 27 | # if version is None or version < 2: 28 | # # Do not warn if train from scratch 29 | # scratch = True 30 | # logger = logging.getLogger(__name__) 31 | # for k in list(state_dict.keys()): 32 | # newk = k 33 | # if "sem_seg_head" in k and not k.startswith(prefix + "predictor"): 34 | # newk = k.replace(prefix, prefix + "pixel_decoder.") 35 | # # logger.debug(f"{k} ==> {newk}") 36 | # if newk != k: 37 | # state_dict[newk] = state_dict[k] 38 | # del state_dict[k] 39 | # scratch = False 40 | # 41 | # if not scratch: 42 | # logger.warning( 43 | # f"Weight format of {self.__class__.__name__} have changed! " 44 | # "Please upgrade your models. Applying automatic conversion now ..." 45 | # ) 46 | 47 | @configurable 48 | def __init__( 49 | self, 50 | input_shape: Dict[str, ShapeSpec], 51 | *, 52 | num_classes: int, 53 | pixel_decoder: nn.Module, 54 | loss_weight: float = 1.0, 55 | ignore_value: int = -1, 56 | # extra parameters 57 | transformer_predictor: nn.Module, 58 | transformer_in_feature: str, 59 | ): 60 | """ 61 | NOTE: this interface is experimental. 62 | Args: 63 | input_shape: shapes (channels and stride) of the input features 64 | num_classes: number of classes to predict 65 | pixel_decoder: the pixel decoder module 66 | loss_weight: loss weight 67 | ignore_value: category id to be ignored during training. 
68 | transformer_predictor: the transformer decoder that makes prediction 69 | transformer_in_feature: input feature name to the transformer_predictor 70 | """ 71 | super().__init__() 72 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) 73 | self.in_features = [k for k, v in input_shape] 74 | feature_strides = [v.stride for k, v in input_shape] 75 | feature_channels = [v.channels for k, v in input_shape] 76 | 77 | self.ignore_value = ignore_value 78 | self.common_stride = 4 79 | self.loss_weight = loss_weight 80 | 81 | self.pixel_decoder = pixel_decoder 82 | self.predictor = transformer_predictor 83 | self.transformer_in_feature = transformer_in_feature 84 | 85 | self.num_classes = num_classes 86 | 87 | @classmethod 88 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): 89 | # figure out in_channels to transformer predictor 90 | if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder": 91 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 92 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding": 93 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 94 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder": # for maskformer2 95 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 96 | else: 97 | transformer_predictor_in_channels = input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels 98 | 99 | return { 100 | "input_shape": { 101 | k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES 102 | }, 103 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 104 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 105 | "pixel_decoder": build_pixel_decoder(cfg, input_shape), 106 | "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, 107 | "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE, 108 | "transformer_predictor": build_transformer_decoder( 109 | cfg, 110 | transformer_predictor_in_channels, 111 | mask_classification=True, 112 | ), 113 | } 114 | 115 | def forward(self, features, mask=None): 116 | return self.layers(features, mask) 117 | 118 | def layers(self, features, mask=None): 119 | mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(features) 120 | if self.transformer_in_feature == "multi_scale_pixel_decoder": 121 | predictions = self.predictor(multi_scale_features, mask_features, mask) 122 | else: 123 | if self.transformer_in_feature == "transformer_encoder": 124 | assert ( 125 | transformer_encoder_features is not None 126 | ), "Please use the TransformerEncoderPixelDecoder." 127 | predictions = self.predictor(transformer_encoder_features, mask_features, mask) 128 | elif self.transformer_in_feature == "pixel_embedding": 129 | predictions = self.predictor(mask_features, mask_features, mask) 130 | else: 131 | predictions = self.predictor(features[self.transformer_in_feature], mask_features, mask) 132 | return predictions 133 | -------------------------------------------------------------------------------- /tools/analyze_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detectron2/blob/main/tools/analyze_model.py 4 | 5 | import logging 6 | import numpy as np 7 | from collections import Counter 8 | import tqdm 9 | from fvcore.nn import flop_count_table # can also try flop_count_str 10 | 11 | from detectron2.checkpoint import DetectionCheckpointer 12 | from detectron2.config import CfgNode, LazyConfig, get_cfg, instantiate 13 | from detectron2.data import build_detection_test_loader 14 | from detectron2.engine import default_argument_parser 15 | from detectron2.modeling import build_model 16 | from detectron2.projects.deeplab import add_deeplab_config 17 | from detectron2.utils.analysis import ( 18 | FlopCountAnalysis, 19 | activation_count_operators, 20 | parameter_count_table, 21 | ) 22 | from detectron2.utils.logger import setup_logger 23 | 24 | # fmt: off 25 | import os 26 | import sys 27 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 28 | # fmt: on 29 | 30 | from hgformer import add_maskformer2_config 31 | 32 | logger = logging.getLogger("detectron2") 33 | 34 | 35 | def setup(args): 36 | if args.config_file.endswith(".yaml"): 37 | cfg = get_cfg() 38 | add_deeplab_config(cfg) 39 | add_maskformer2_config(cfg) 40 | cfg.merge_from_file(args.config_file) 41 | cfg.DATALOADER.NUM_WORKERS = 0 42 | cfg.merge_from_list(args.opts) 43 | cfg.freeze() 44 | else: 45 | cfg = LazyConfig.load(args.config_file) 46 | cfg = LazyConfig.apply_overrides(cfg, args.opts) 47 | setup_logger(name="fvcore") 48 | setup_logger() 49 | return cfg 50 | 51 | 52 | def do_flop(cfg): 53 | if isinstance(cfg, CfgNode): 54 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) 55 | model = build_model(cfg) 56 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 57 | else: 58 | data_loader = instantiate(cfg.dataloader.test) 59 | model = instantiate(cfg.model) 60 | model.to(cfg.train.device) 61 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint) 62 | model.eval() 63 | 64 | counts = Counter() 65 | total_flops = [] 66 | for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa 67 | if args.use_fixed_input_size and isinstance(cfg, CfgNode): 68 | import torch 69 | crop_size = cfg.INPUT.CROP.SIZE[0] 70 | data[0]["image"] = torch.zeros((3, crop_size, crop_size)) 71 | flops = FlopCountAnalysis(model, data) 72 | if idx > 0: 73 | flops.unsupported_ops_warnings(False).uncalled_modules_warnings(False) 74 | counts += flops.by_operator() 75 | total_flops.append(flops.total()) 76 | 77 | logger.info("Flops table computed from only one input sample:\n" + flop_count_table(flops)) 78 | logger.info( 79 | "Average GFlops for each type of operators:\n" 80 | + str([(k, v / (idx + 1) / 1e9) for k, v in counts.items()]) 81 | ) 82 | logger.info( 83 | "Total GFlops: {:.1f}±{:.1f}".format(np.mean(total_flops) / 1e9, np.std(total_flops) / 1e9) 84 | ) 85 | 86 | 87 | def do_activation(cfg): 88 | if isinstance(cfg, CfgNode): 89 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) 90 | model = build_model(cfg) 91 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 92 | else: 93 | data_loader = instantiate(cfg.dataloader.test) 94 | model = instantiate(cfg.model) 95 | model.to(cfg.train.device) 96 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint) 97 | model.eval() 98 | 99 | counts = Counter() 100 | total_activations = [] 101 | for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa 102 | count = activation_count_operators(model, data) 103 | counts += count 104 
| total_activations.append(sum(count.values())) 105 | logger.info( 106 | "(Million) Activations for Each Type of Operators:\n" 107 | + str([(k, v / idx) for k, v in counts.items()]) 108 | ) 109 | logger.info( 110 | "Total (Million) Activations: {}±{}".format( 111 | np.mean(total_activations), np.std(total_activations) 112 | ) 113 | ) 114 | 115 | 116 | def do_parameter(cfg): 117 | if isinstance(cfg, CfgNode): 118 | model = build_model(cfg) 119 | else: 120 | model = instantiate(cfg.model) 121 | logger.info("Parameter Count:\n" + parameter_count_table(model, max_depth=5)) 122 | 123 | 124 | def do_structure(cfg): 125 | if isinstance(cfg, CfgNode): 126 | model = build_model(cfg) 127 | else: 128 | model = instantiate(cfg.model) 129 | logger.info("Model Structure:\n" + str(model)) 130 | 131 | 132 | if __name__ == "__main__": 133 | parser = default_argument_parser( 134 | epilog=""" 135 | Examples: 136 | To show parameters of a model: 137 | $ ./analyze_model.py --tasks parameter \\ 138 | --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml 139 | Flops and activations are data-dependent, therefore inputs and model weights 140 | are needed to count them: 141 | $ ./analyze_model.py --num-inputs 100 --tasks flop \\ 142 | --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \\ 143 | MODEL.WEIGHTS /path/to/model.pkl 144 | """ 145 | ) 146 | parser.add_argument( 147 | "--tasks", 148 | choices=["flop", "activation", "parameter", "structure"], 149 | required=True, 150 | nargs="+", 151 | ) 152 | parser.add_argument( 153 | "-n", 154 | "--num-inputs", 155 | default=100, 156 | type=int, 157 | help="number of inputs used to compute statistics for flops/activations, " 158 | "both are data dependent.", 159 | ) 160 | parser.add_argument( 161 | "--use-fixed-input-size", 162 | action="store_true", 163 | help="use fixed input size when calculating flops", 164 | ) 165 | args = parser.parse_args() 166 | assert not args.eval_only 167 | assert args.num_gpus == 1 168 | 169 | cfg = setup(args) 170 | 171 | for task in args.tasks: 172 | { 173 | "flop": do_flop, 174 | "activation": do_activation, 175 | "parameter": do_parameter, 176 | "structure": do_structure, 177 | }[task](cfg) 178 | -------------------------------------------------------------------------------- /hgformer/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | import logging 4 | 5 | import numpy as np 6 | import torch 7 | from torch.nn import functional as F 8 | 9 | from detectron2.config import configurable 10 | from detectron2.data import detection_utils as utils 11 | from detectron2.data import transforms as T 12 | from detectron2.structures import BitMasks, Instances 13 | 14 | from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper 15 | 16 | __all__ = ["MaskFormerPanopticDatasetMapper"] 17 | 18 | 19 | class MaskFormerPanopticDatasetMapper(MaskFormerSemanticDatasetMapper): 20 | """ 21 | A callable which takes a dataset dict in Detectron2 Dataset format, 22 | and map it into a format used by MaskFormer for panoptic segmentation. 23 | 24 | The callable currently does the following: 25 | 26 | 1. Read the image from "file_name" 27 | 2. Applies geometric transforms to the image and annotation 28 | 3. Find and applies suitable cropping to the image and annotation 29 | 4. 
Prepare image and annotation to Tensors 30 | """ 31 | 32 | @configurable 33 | def __init__( 34 | self, 35 | is_train=True, 36 | *, 37 | augmentations, 38 | image_format, 39 | ignore_label, 40 | size_divisibility, 41 | ): 42 | """ 43 | NOTE: this interface is experimental. 44 | Args: 45 | is_train: for training or inference 46 | augmentations: a list of augmentations or deterministic transforms to apply 47 | image_format: an image format supported by :func:`detection_utils.read_image`. 48 | ignore_label: the label that is ignored to evaluation 49 | size_divisibility: pad image size to be divisible by this value 50 | """ 51 | super().__init__( 52 | is_train, 53 | augmentations=augmentations, 54 | image_format=image_format, 55 | ignore_label=ignore_label, 56 | size_divisibility=size_divisibility, 57 | ) 58 | 59 | def __call__(self, dataset_dict): 60 | """ 61 | Args: 62 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 63 | 64 | Returns: 65 | dict: a format that builtin models in detectron2 accept 66 | """ 67 | assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!" 68 | 69 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 70 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 71 | utils.check_image_size(dataset_dict, image) 72 | 73 | # semantic segmentation 74 | if "sem_seg_file_name" in dataset_dict: 75 | # PyTorch transformation not implemented for uint16, so converting it to double first 76 | sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") 77 | else: 78 | sem_seg_gt = None 79 | 80 | # panoptic segmentation 81 | if "pan_seg_file_name" in dataset_dict: 82 | pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") 83 | segments_info = dataset_dict["segments_info"] 84 | else: 85 | pan_seg_gt = None 86 | segments_info = None 87 | 88 | if pan_seg_gt is None: 89 | raise ValueError( 90 | "Cannot find 'pan_seg_file_name' for panoptic segmentation dataset {}.".format( 91 | dataset_dict["file_name"] 92 | ) 93 | ) 94 | 95 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt) 96 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) 97 | image = aug_input.image 98 | if sem_seg_gt is not None: 99 | sem_seg_gt = aug_input.sem_seg 100 | 101 | # apply the same transformation to panoptic segmentation 102 | pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) 103 | 104 | from panopticapi.utils import rgb2id 105 | 106 | pan_seg_gt = rgb2id(pan_seg_gt) 107 | 108 | # Pad image and segmentation label here! 
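        # Padding conventions used below: the image is padded with 128 (mid-gray),
        # the semantic map with ``ignore_label``, and the panoptic id map with 0
        # (the VOID id), so padded pixels never contribute to any ground-truth mask.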
109 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 110 | if sem_seg_gt is not None: 111 | sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) 112 | pan_seg_gt = torch.as_tensor(pan_seg_gt.astype("long")) 113 | 114 | if self.size_divisibility > 0: 115 | image_size = (image.shape[-2], image.shape[-1]) 116 | padding_size = [ 117 | 0, 118 | self.size_divisibility - image_size[1], 119 | 0, 120 | self.size_divisibility - image_size[0], 121 | ] 122 | image = F.pad(image, padding_size, value=128).contiguous() 123 | if sem_seg_gt is not None: 124 | sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous() 125 | pan_seg_gt = F.pad( 126 | pan_seg_gt, padding_size, value=0 127 | ).contiguous() # 0 is the VOID panoptic label 128 | 129 | image_shape = (image.shape[-2], image.shape[-1]) # h, w 130 | 131 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 132 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 133 | # Therefore it's important to use torch.Tensor. 134 | dataset_dict["image"] = image 135 | if sem_seg_gt is not None: 136 | dataset_dict["sem_seg"] = sem_seg_gt.long() 137 | 138 | if "annotations" in dataset_dict: 139 | raise ValueError("Pemantic segmentation dataset should not have 'annotations'.") 140 | 141 | # Prepare per-category binary masks 142 | pan_seg_gt = pan_seg_gt.numpy() 143 | instances = Instances(image_shape) 144 | classes = [] 145 | masks = [] 146 | for segment_info in segments_info: 147 | class_id = segment_info["category_id"] 148 | if not segment_info["iscrowd"]: 149 | classes.append(class_id) 150 | masks.append(pan_seg_gt == segment_info["id"]) 151 | 152 | classes = np.array(classes) 153 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64) 154 | if len(masks) == 0: 155 | # Some image does not have annotation (all ignored) 156 | instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) 157 | else: 158 | masks = BitMasks( 159 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) 160 | ) 161 | instances.gt_masks = masks.tensor 162 | 163 | dataset_dict["instances"] = instances 164 | 165 | return dataset_dict 166 | -------------------------------------------------------------------------------- /hgformer/modeling/pixel_decoder/ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import warnings 17 | import math 18 | 19 | import torch 20 | from torch import nn 21 | import torch.nn.functional as F 22 | from torch.nn.init import xavier_uniform_, constant_ 23 | 24 | from ..functions import MSDeformAttnFunction 25 | from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch 26 | 27 | 28 | def _is_power_of_2(n): 29 | if (not isinstance(n, int)) or (n < 0): 30 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 31 | return (n & (n-1) == 0) and n != 0 32 | 33 | 34 | class MSDeformAttn(nn.Module): 35 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): 36 | """ 37 | Multi-Scale Deformable Attention Module 38 | :param d_model hidden dimension 39 | :param n_levels number of feature levels 40 | :param n_heads number of attention heads 41 | :param n_points number of sampling points per attention head per feature level 42 | """ 43 | super().__init__() 44 | if d_model % n_heads != 0: 45 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 46 | _d_per_head = d_model // n_heads 47 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 48 | if not _is_power_of_2(_d_per_head): 49 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 50 | "which is more efficient in our CUDA implementation.") 51 | 52 | self.im2col_step = 128 53 | 54 | self.d_model = d_model 55 | self.n_levels = n_levels 56 | self.n_heads = n_heads 57 | self.n_points = n_points 58 | 59 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 60 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 61 | self.value_proj = nn.Linear(d_model, d_model) 62 | self.output_proj = nn.Linear(d_model, d_model) 63 | 64 | self._reset_parameters() 65 | 66 | def _reset_parameters(self): 67 | constant_(self.sampling_offsets.weight.data, 0.) 68 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 69 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 70 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 71 | for i in range(self.n_points): 72 | grid_init[:, :, i, :] *= i + 1 73 | with torch.no_grad(): 74 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 75 | constant_(self.attention_weights.weight.data, 0.) 76 | constant_(self.attention_weights.bias.data, 0.) 77 | xavier_uniform_(self.value_proj.weight.data) 78 | constant_(self.value_proj.bias.data, 0.) 79 | xavier_uniform_(self.output_proj.weight.data) 80 | constant_(self.output_proj.bias.data, 0.) 
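    # Initialization above, in short: the sampling-offset bias is set so that each of the
    # n_heads heads starts by sampling along its own evenly spaced direction, with the i-th
    # point pushed (i + 1) steps further out; attention-weight logits start at zero (uniform
    # after the softmax), and the value/output projections use Xavier weights with zero bias.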
81 | 82 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 83 | """ 84 | :param query (N, Length_{query}, C) 85 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 86 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 87 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 88 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 89 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 90 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 91 | 92 | :return output (N, Length_{query}, C) 93 | """ 94 | N, Len_q, _ = query.shape 95 | N, Len_in, _ = input_flatten.shape 96 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 97 | 98 | value = self.value_proj(input_flatten) 99 | if input_padding_mask is not None: 100 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 101 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 102 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 103 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 104 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 105 | # N, Len_q, n_heads, n_levels, n_points, 2 106 | if reference_points.shape[-1] == 2: 107 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 108 | sampling_locations = reference_points[:, :, None, :, None, :] \ 109 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] 110 | elif reference_points.shape[-1] == 4: 111 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 112 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 113 | else: 114 | raise ValueError( 115 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 116 | try: 117 | output = MSDeformAttnFunction.apply( 118 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 119 | except: 120 | # CPU 121 | output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 122 | # # For FLOPs calculation only 123 | # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 124 | output = self.output_proj(output) 125 | return output 126 | -------------------------------------------------------------------------------- /hgformer/data/dataset_mappers/mask_former_instance_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import copy 3 | import logging 4 | 5 | import numpy as np 6 | import pycocotools.mask as mask_util 7 | import torch 8 | from torch.nn import functional as F 9 | 10 | from detectron2.config import configurable 11 | from detectron2.data import detection_utils as utils 12 | from detectron2.data import transforms as T 13 | from detectron2.projects.point_rend import ColorAugSSDTransform 14 | from detectron2.structures import BitMasks, Instances, polygons_to_bitmask 15 | 16 | __all__ = ["MaskFormerInstanceDatasetMapper"] 17 | 18 | 19 | class MaskFormerInstanceDatasetMapper: 20 | """ 21 | A callable which takes a dataset dict in Detectron2 Dataset format, 22 | and map it into a format used by MaskFormer for instance segmentation. 23 | 24 | The callable currently does the following: 25 | 26 | 1. Read the image from "file_name" 27 | 2. Applies geometric transforms to the image and annotation 28 | 3. Find and applies suitable cropping to the image and annotation 29 | 4. Prepare image and annotation to Tensors 30 | """ 31 | 32 | @configurable 33 | def __init__( 34 | self, 35 | is_train=True, 36 | *, 37 | augmentations, 38 | image_format, 39 | size_divisibility, 40 | ): 41 | """ 42 | NOTE: this interface is experimental. 43 | Args: 44 | is_train: for training or inference 45 | augmentations: a list of augmentations or deterministic transforms to apply 46 | image_format: an image format supported by :func:`detection_utils.read_image`. 47 | size_divisibility: pad image size to be divisible by this value 48 | """ 49 | self.is_train = is_train 50 | self.tfm_gens = augmentations 51 | self.img_format = image_format 52 | self.size_divisibility = size_divisibility 53 | 54 | logger = logging.getLogger(__name__) 55 | mode = "training" if is_train else "inference" 56 | logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}") 57 | 58 | @classmethod 59 | def from_config(cls, cfg, is_train=True): 60 | # Build augmentation 61 | augs = [ 62 | T.ResizeShortestEdge( 63 | cfg.INPUT.MIN_SIZE_TRAIN, 64 | cfg.INPUT.MAX_SIZE_TRAIN, 65 | cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, 66 | ) 67 | ] 68 | if cfg.INPUT.CROP.ENABLED: 69 | augs.append( 70 | T.RandomCrop( 71 | cfg.INPUT.CROP.TYPE, 72 | cfg.INPUT.CROP.SIZE, 73 | ) 74 | ) 75 | if cfg.INPUT.COLOR_AUG_SSD: 76 | augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) 77 | augs.append(T.RandomFlip()) 78 | 79 | ret = { 80 | "is_train": is_train, 81 | "augmentations": augs, 82 | "image_format": cfg.INPUT.FORMAT, 83 | "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY, 84 | } 85 | return ret 86 | 87 | def __call__(self, dataset_dict): 88 | """ 89 | Args: 90 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 91 | 92 | Returns: 93 | dict: a format that builtin models in detectron2 accept 94 | """ 95 | assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!" 
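        # Overall flow: read the image, apply the configured augmentations, turn every
        # non-crowd annotation (polygon, COCO RLE, or binary array) into a bitmask,
        # optionally pad image and masks to ``size_divisibility``, and pack the result
        # into an Instances object stored under "instances".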
96 | 97 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 98 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 99 | utils.check_image_size(dataset_dict, image) 100 | 101 | aug_input = T.AugInput(image) 102 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) 103 | image = aug_input.image 104 | 105 | # transform instnace masks 106 | assert "annotations" in dataset_dict 107 | for anno in dataset_dict["annotations"]: 108 | anno.pop("keypoints", None) 109 | 110 | annos = [ 111 | utils.transform_instance_annotations(obj, transforms, image.shape[:2]) 112 | for obj in dataset_dict.pop("annotations") 113 | if obj.get("iscrowd", 0) == 0 114 | ] 115 | 116 | if len(annos): 117 | assert "segmentation" in annos[0] 118 | segms = [obj["segmentation"] for obj in annos] 119 | masks = [] 120 | for segm in segms: 121 | if isinstance(segm, list): 122 | # polygon 123 | masks.append(polygons_to_bitmask(segm, *image.shape[:2])) 124 | elif isinstance(segm, dict): 125 | # COCO RLE 126 | masks.append(mask_util.decode(segm)) 127 | elif isinstance(segm, np.ndarray): 128 | assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format( 129 | segm.ndim 130 | ) 131 | # mask array 132 | masks.append(segm) 133 | else: 134 | raise ValueError( 135 | "Cannot convert segmentation of type '{}' to BitMasks!" 136 | "Supported types are: polygons as list[list[float] or ndarray]," 137 | " COCO-style RLE as a dict, or a binary segmentation mask " 138 | " in a 2D numpy array of shape HxW.".format(type(segm)) 139 | ) 140 | 141 | # Pad image and segmentation label here! 142 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 143 | masks = [torch.from_numpy(np.ascontiguousarray(x)) for x in masks] 144 | 145 | classes = [int(obj["category_id"]) for obj in annos] 146 | classes = torch.tensor(classes, dtype=torch.int64) 147 | 148 | if self.size_divisibility > 0: 149 | image_size = (image.shape[-2], image.shape[-1]) 150 | padding_size = [ 151 | 0, 152 | self.size_divisibility - image_size[1], 153 | 0, 154 | self.size_divisibility - image_size[0], 155 | ] 156 | # pad image 157 | image = F.pad(image, padding_size, value=128).contiguous() 158 | # pad mask 159 | masks = [F.pad(x, padding_size, value=0).contiguous() for x in masks] 160 | 161 | image_shape = (image.shape[-2], image.shape[-1]) # h, w 162 | 163 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 164 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 165 | # Therefore it's important to use torch.Tensor. 166 | dataset_dict["image"] = image 167 | 168 | # Prepare per-category binary masks 169 | instances = Instances(image_shape) 170 | instances.gt_classes = classes 171 | if len(masks) == 0: 172 | # Some image does not have annotation (all ignored) 173 | instances.gt_masks = torch.zeros((0, image.shape[-2], image.shape[-1])) 174 | else: 175 | masks = BitMasks(torch.stack(masks)) 176 | instances.gt_masks = masks.tensor 177 | 178 | dataset_dict["instances"] = instances 179 | 180 | return dataset_dict 181 | -------------------------------------------------------------------------------- /hgformer/modeling/transformer_decoder/maskformer_transformer_decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py 3 | import fvcore.nn.weight_init as weight_init 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | from detectron2.config import configurable 9 | from detectron2.layers import Conv2d 10 | from detectron2.utils.registry import Registry 11 | 12 | from .position_encoding import PositionEmbeddingSine 13 | from .transformer import Transformer 14 | 15 | 16 | TRANSFORMER_DECODER_REGISTRY = Registry("TRANSFORMER_MODULE") 17 | TRANSFORMER_DECODER_REGISTRY.__doc__ = """ 18 | Registry for transformer module in MaskFormer. 19 | """ 20 | 21 | 22 | def build_transformer_decoder(cfg, in_channels, mask_classification=True): 23 | """ 24 | Build a instance embedding branch from `cfg.MODEL.INS_EMBED_HEAD.NAME`. 25 | """ 26 | name = cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME 27 | return TRANSFORMER_DECODER_REGISTRY.get(name)(cfg, in_channels, mask_classification) 28 | 29 | 30 | @TRANSFORMER_DECODER_REGISTRY.register() 31 | class StandardTransformerDecoder(nn.Module): 32 | @configurable 33 | def __init__( 34 | self, 35 | in_channels, 36 | mask_classification=True, 37 | *, 38 | num_classes: int, 39 | hidden_dim: int, 40 | num_queries: int, 41 | nheads: int, 42 | dropout: float, 43 | dim_feedforward: int, 44 | enc_layers: int, 45 | dec_layers: int, 46 | pre_norm: bool, 47 | deep_supervision: bool, 48 | mask_dim: int, 49 | enforce_input_project: bool, 50 | ): 51 | """ 52 | NOTE: this interface is experimental. 53 | Args: 54 | in_channels: channels of the input features 55 | mask_classification: whether to add mask classifier or not 56 | num_classes: number of classes 57 | hidden_dim: Transformer feature dimension 58 | num_queries: number of queries 59 | nheads: number of heads 60 | dropout: dropout in Transformer 61 | dim_feedforward: feature dimension in feedforward network 62 | enc_layers: number of Transformer encoder layers 63 | dec_layers: number of Transformer decoder layers 64 | pre_norm: whether to use pre-LayerNorm or not 65 | deep_supervision: whether to add supervision to every decoder layers 66 | mask_dim: mask feature dimension 67 | enforce_input_project: add input project 1x1 conv even if input 68 | channels and hidden dim is identical 69 | """ 70 | super().__init__() 71 | 72 | self.mask_classification = mask_classification 73 | 74 | # positional encoding 75 | N_steps = hidden_dim // 2 76 | self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) 77 | 78 | transformer = Transformer( 79 | d_model=hidden_dim, 80 | dropout=dropout, 81 | nhead=nheads, 82 | dim_feedforward=dim_feedforward, 83 | num_encoder_layers=enc_layers, 84 | num_decoder_layers=dec_layers, 85 | normalize_before=pre_norm, 86 | return_intermediate_dec=deep_supervision, 87 | ) 88 | 89 | self.num_queries = num_queries 90 | self.transformer = transformer 91 | hidden_dim = transformer.d_model 92 | 93 | self.query_embed = nn.Embedding(num_queries, hidden_dim) 94 | 95 | if in_channels != hidden_dim or enforce_input_project: 96 | self.input_proj = Conv2d(in_channels, hidden_dim, kernel_size=1) 97 | weight_init.c2_xavier_fill(self.input_proj) 98 | else: 99 | self.input_proj = nn.Sequential() 100 | self.aux_loss = deep_supervision 101 | 102 | # output FFNs 103 | if self.mask_classification: 104 | self.class_embed = nn.Linear(hidden_dim, num_classes + 1) 105 | self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3) 106 | 107 | @classmethod 108 | def from_config(cls, cfg, in_channels, 
mask_classification): 109 | ret = {} 110 | ret["in_channels"] = in_channels 111 | ret["mask_classification"] = mask_classification 112 | 113 | ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES 114 | ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM 115 | ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES 116 | # Transformer parameters: 117 | ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS 118 | ret["dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT 119 | ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD 120 | ret["enc_layers"] = cfg.MODEL.MASK_FORMER.ENC_LAYERS 121 | ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS 122 | ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM 123 | ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION 124 | ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ 125 | 126 | ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 127 | 128 | return ret 129 | 130 | def forward(self, x, mask_features, mask=None): 131 | if mask is not None: 132 | mask = F.interpolate(mask[None].float(), size=x.shape[-2:]).to(torch.bool)[0] 133 | pos = self.pe_layer(x, mask) 134 | 135 | src = x 136 | hs, memory = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos) 137 | 138 | if self.mask_classification: 139 | outputs_class = self.class_embed(hs) 140 | out = {"pred_logits": outputs_class[-1]} 141 | else: 142 | out = {} 143 | 144 | if self.aux_loss: 145 | # [l, bs, queries, embed] 146 | mask_embed = self.mask_embed(hs) 147 | outputs_seg_masks = torch.einsum("lbqc,bchw->lbqhw", mask_embed, mask_features) 148 | out["pred_masks"] = outputs_seg_masks[-1] 149 | out["aux_outputs"] = self._set_aux_loss( 150 | outputs_class if self.mask_classification else None, outputs_seg_masks 151 | ) 152 | else: 153 | # FIXME h_boxes takes the last one computed, keep this in mind 154 | # [bs, queries, embed] 155 | mask_embed = self.mask_embed(hs[-1]) 156 | outputs_seg_masks = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features) 157 | out["pred_masks"] = outputs_seg_masks 158 | return out 159 | 160 | @torch.jit.unused 161 | def _set_aux_loss(self, outputs_class, outputs_seg_masks): 162 | # this is a workaround to make torchscript happy, as torchscript 163 | # doesn't support dictionary with non-homogeneous values, such 164 | # as a dict having both a Tensor and a list. 165 | if self.mask_classification: 166 | return [ 167 | {"pred_logits": a, "pred_masks": b} 168 | for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1]) 169 | ] 170 | else: 171 | return [{"pred_masks": b} for b in outputs_seg_masks[:-1]] 172 | 173 | 174 | class MLP(nn.Module): 175 | """Very simple multi-layer perceptron (also called FFN)""" 176 | 177 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 178 | super().__init__() 179 | self.num_layers = num_layers 180 | h = [hidden_dim] * (num_layers - 1) 181 | self.layers = nn.ModuleList( 182 | nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) 183 | ) 184 | 185 | def forward(self, x): 186 | for i, layer in enumerate(self.layers): 187 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 188 | return x 189 | -------------------------------------------------------------------------------- /hgformer/data/dataset_mappers/mask_former_semantic_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import copy 3 | import logging 4 | 5 | import numpy as np 6 | import torch 7 | from torch.nn import functional as F 8 | 9 | from detectron2.config import configurable 10 | from detectron2.data import MetadataCatalog 11 | from detectron2.data import detection_utils as utils 12 | from detectron2.data import transforms as T 13 | from detectron2.projects.point_rend import ColorAugSSDTransform 14 | from detectron2.structures import BitMasks, Instances 15 | 16 | __all__ = ["MaskFormerSemanticDatasetMapper"] 17 | 18 | 19 | class MaskFormerSemanticDatasetMapper: 20 | """ 21 | A callable which takes a dataset dict in Detectron2 Dataset format, 22 | and map it into a format used by MaskFormer for semantic segmentation. 23 | 24 | The callable currently does the following: 25 | 26 | 1. Read the image from "file_name" 27 | 2. Applies geometric transforms to the image and annotation 28 | 3. Find and applies suitable cropping to the image and annotation 29 | 4. Prepare image and annotation to Tensors 30 | """ 31 | 32 | @configurable 33 | def __init__( 34 | self, 35 | is_train=True, 36 | *, 37 | augmentations, 38 | image_format, 39 | ignore_label, 40 | size_divisibility, 41 | ): 42 | """ 43 | NOTE: this interface is experimental. 44 | Args: 45 | is_train: for training or inference 46 | augmentations: a list of augmentations or deterministic transforms to apply 47 | image_format: an image format supported by :func:`detection_utils.read_image`. 48 | ignore_label: the label that is ignored to evaluation 49 | size_divisibility: pad image size to be divisible by this value 50 | """ 51 | self.is_train = is_train 52 | self.tfm_gens = augmentations 53 | self.img_format = image_format 54 | self.ignore_label = ignore_label 55 | self.size_divisibility = size_divisibility 56 | 57 | logger = logging.getLogger(__name__) 58 | mode = "training" if is_train else "inference" 59 | logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}") 60 | 61 | @classmethod 62 | def from_config(cls, cfg, is_train=True): 63 | # Build augmentation 64 | augs = [ 65 | T.ResizeShortestEdge( 66 | cfg.INPUT.MIN_SIZE_TRAIN, 67 | cfg.INPUT.MAX_SIZE_TRAIN, 68 | cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, 69 | ) 70 | ] 71 | if cfg.INPUT.CROP.ENABLED: 72 | augs.append( 73 | T.RandomCrop_CategoryAreaConstraint( 74 | cfg.INPUT.CROP.TYPE, 75 | cfg.INPUT.CROP.SIZE, 76 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA, 77 | cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 78 | ) 79 | ) 80 | # import ipdb; ipdb.set_trace() 81 | if cfg.INPUT.COLOR_AUG_SSD: 82 | augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) 83 | augs.append(T.RandomFlip()) 84 | 85 | # Assume always applies to the training set. 86 | dataset_names = cfg.DATASETS.TRAIN 87 | meta = MetadataCatalog.get(dataset_names[0]) 88 | ignore_label = meta.ignore_label 89 | 90 | ret = { 91 | "is_train": is_train, 92 | "augmentations": augs, 93 | "image_format": cfg.INPUT.FORMAT, 94 | "ignore_label": ignore_label, 95 | "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY, 96 | } 97 | return ret 98 | 99 | def __call__(self, dataset_dict): 100 | """ 101 | Args: 102 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 103 | 104 | Returns: 105 | dict: a format that builtin models in detectron2 accept 106 | """ 107 | assert self.is_train, "MaskFormerSemanticDatasetMapper should only be used for training!" 
108 | 
109 |         dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
110 |         image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
111 |         utils.check_image_size(dataset_dict, image)
112 |         if "sem_seg_file_name" in dataset_dict:
113 |             # PyTorch transformation not implemented for uint16, so converting it to double first
114 |             sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double")
115 |         else:
116 |             sem_seg_gt = None
117 | 
118 |         if sem_seg_gt is None:
119 |             raise ValueError(
120 |                 "Cannot find 'sem_seg_file_name' for semantic segmentation dataset {}.".format(
121 |                     dataset_dict["file_name"]
122 |                 )
123 |             )
124 | 
125 |         aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
126 |         aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
127 |         image = aug_input.image
128 |         sem_seg_gt = aug_input.sem_seg
129 | 
130 |         # Pad image and segmentation label here!
131 |         image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
132 |         if sem_seg_gt is not None:
133 |             sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long"))
134 | 
135 |         if self.size_divisibility > 0:
136 |             image_size = (image.shape[-2], image.shape[-1])
137 |             padding_size = [
138 |                 0,
139 |                 self.size_divisibility - image_size[1],
140 |                 0,
141 |                 self.size_divisibility - image_size[0],
142 |             ]
143 |             image = F.pad(image, padding_size, value=128).contiguous()
144 |             if sem_seg_gt is not None:
145 |                 sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous()
146 | 
147 |         image_shape = (image.shape[-2], image.shape[-1])  # h, w
148 | 
149 |         # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
150 |         # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
151 |         # Therefore it's important to use torch.Tensor.
152 |         dataset_dict["image"] = image
153 | 
154 |         if sem_seg_gt is not None:
155 |             dataset_dict["sem_seg"] = sem_seg_gt.long()
156 |         if "annotations" in dataset_dict:
157 |             raise ValueError("Semantic segmentation dataset should not have 'annotations'.")
158 |         # Prepare per-category binary masks
159 |         if sem_seg_gt is not None:
160 |             sem_seg_gt = sem_seg_gt.numpy()
161 |             instances = Instances(image_shape)
162 |             classes = np.unique(sem_seg_gt)
163 |             # remove ignored region
164 |             classes = classes[classes != self.ignore_label]
165 |             instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
166 | 
167 |             masks = []
168 |             for class_id in classes:
169 |                 masks.append(sem_seg_gt == class_id)
170 | 
171 |             if len(masks) == 0:
172 |                 # Some image does not have annotation (all ignored)
173 |                 instances.gt_masks = torch.zeros((0, sem_seg_gt.shape[-2], sem_seg_gt.shape[-1]))
174 |             else:
175 |                 masks = BitMasks(
176 |                     torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
177 |                 )
178 |                 instances.gt_masks = masks.tensor
179 | 
180 |             dataset_dict["instances"] = instances
181 | 
182 |         return dataset_dict
183 | 
--------------------------------------------------------------------------------
/hgformer/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #include <vector>
17 | #include "cuda/ms_deform_im2col_cuda.cuh"
18 | 
19 | #include <ATen/ATen.h>
20 | #include <ATen/cuda/CUDAContext.h>
21 | #include <cuda.h>
22 | #include <cuda_runtime.h>
23 | 
24 | 
25 | at::Tensor ms_deform_attn_cuda_forward(
26 |     const at::Tensor &value,
27 |     const at::Tensor &spatial_shapes,
28 |     const at::Tensor &level_start_index,
29 |     const at::Tensor &sampling_loc,
30 |     const at::Tensor &attn_weight,
31 |     const int im2col_step)
32 | {
33 |     AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
34 |     AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
35 |     AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
36 |     AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
37 |     AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
38 | 
39 |     AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
40 |     AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
41 |     AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
42 |     AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
43 |     AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
44 | 
45 |     const int batch = value.size(0);
46 |     const int spatial_size = value.size(1);
47 |     const int num_heads = value.size(2);
48 |     const int channels = value.size(3);
49 | 
50 |     const int num_levels = spatial_shapes.size(0);
51 | 
52 |     const int num_query = sampling_loc.size(1);
53 |     const int num_point = sampling_loc.size(4);
54 | 
55 |     const int im2col_step_ = std::min(batch, im2col_step);
56 | 
57 |     AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
58 | 
59 |     auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
60 | 
61 |     const int batch_n = im2col_step_;
62 |     auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
63 |     auto per_value_size = spatial_size * num_heads * channels;
64 |     auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
65 |     auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
66 |     for (int n = 0; n < batch/im2col_step_; ++n)
67 |     {
68 |         auto columns = output_n.select(0, n);
69 |         AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] {
70 |             ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
71 |                 value.data<scalar_t>() + n * im2col_step_ * per_value_size,
72 |                 spatial_shapes.data<int64_t>(),
73 |                 level_start_index.data<int64_t>(),
74 |                 sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
75 |                 attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
76 |                 batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
77 |                 columns.data<scalar_t>());
78 | 
79 |         }));
80 |     }
81 | 
82 |     output = output.view({batch, num_query, num_heads*channels});
83 | 
84 |     return output;
85 | }
86 | 
87 | 
88 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
89 |     const at::Tensor &value,
90 |     const at::Tensor &spatial_shapes,
91 |     const at::Tensor &level_start_index,
92 |     const at::Tensor &sampling_loc,
93 |     const at::Tensor &attn_weight,
94 |     const at::Tensor &grad_output,
95 |     const int im2col_step)
96 | {
97 | 
98 |     AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
99 |     AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
100 |     AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
101 |     AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
102 |     AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
103 |     AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
104 | 
105 |     AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
106 |     AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
107 |     AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
108 |     AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
109 |     AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
110 |     AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
111 | 
112 |     const int batch = value.size(0);
113 |     const int spatial_size = value.size(1);
114 |     const int num_heads = value.size(2);
115 |     const int channels = value.size(3);
116 | 
117 |     const int num_levels = spatial_shapes.size(0);
118 | 
119 |     const int num_query = sampling_loc.size(1);
120 |     const int num_point = sampling_loc.size(4);
121 | 
122 |     const int im2col_step_ = std::min(batch, im2col_step);
123 | 
124 |     AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
125 | 
126 |     auto grad_value = at::zeros_like(value);
127 |     auto grad_sampling_loc = at::zeros_like(sampling_loc);
128 |     auto grad_attn_weight = at::zeros_like(attn_weight);
129 | 
130 |     const int batch_n = im2col_step_;
131 |     auto per_value_size = spatial_size * num_heads * channels;
132 |     auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
133 |     auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
134 |     auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
135 | 
136 |     for (int n = 0; n < batch/im2col_step_; ++n)
137 |     {
138 |         auto grad_output_g = grad_output_n.select(0, n);
139 |         AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] {
140 |             ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
141 |                 grad_output_g.data<scalar_t>(),
142 |                 value.data<scalar_t>() + n * im2col_step_ * per_value_size,
143 |                 spatial_shapes.data<int64_t>(),
144 |                 level_start_index.data<int64_t>(),
145 |                 sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
146 |                 attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
147 |                 batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
148 |                 grad_value.data<scalar_t>() + n * im2col_step_ * per_value_size,
149 |                 grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
150 |                 grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
151 | 
152 |         }));
153 |     }
154 | 
155 |     return {
156 | 
grad_value, grad_sampling_loc, grad_attn_weight 157 | }; 158 | } -------------------------------------------------------------------------------- /hgformer/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | from detectron2.config import CfgNode as CN 4 | 5 | 6 | def add_maskformer2_config(cfg): 7 | """ 8 | Add config for MASK_FORMER. 9 | """ 10 | # NOTE: configs from original maskformer 11 | # data config 12 | # select the dataset mapper 13 | cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic" 14 | # Color augmentation 15 | cfg.INPUT.COLOR_AUG_SSD = False 16 | cfg.INPUT.COLOR_AUG_MIX = 'partial' 17 | # We retry random cropping until no single category in semantic segmentation GT occupies more 18 | # than `SINGLE_CATEGORY_MAX_AREA` part of the crop. 19 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0 20 | # Pad image and segmentation GT in dataset mapper. 21 | cfg.INPUT.SIZE_DIVISIBILITY = -1 22 | 23 | # solver config 24 | # weight decay on embedding 25 | cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0 26 | # optimizer 27 | cfg.SOLVER.OPTIMIZER = "ADAMW" 28 | cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1 29 | 30 | # mask_former model config 31 | cfg.MODEL.MASK_FORMER = CN() 32 | 33 | # loss 34 | cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION = True 35 | cfg.MODEL.MASK_FORMER.DEEP_MASK_SUPERVISION = False 36 | cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT = 0.1 37 | cfg.MODEL.MASK_FORMER.CLASS_WEIGHT = 1.0 38 | cfg.MODEL.MASK_FORMER.DICE_WEIGHT = 1.0 39 | cfg.MODEL.MASK_FORMER.MASK_WEIGHT = 20.0 40 | cfg.MODEL.MASK_FORMER.SPIX_MASK_WEIGHT = 20.0 41 | cfg.MODEL.MASK_FORMER.SPIX_COLOR_WEIGHT = 1.0 42 | cfg.MODEL.MASK_FORMER.SPIX_CLASS_WEIGHT = 1.0 43 | cfg.MODEL.MASK_FORMER.PIXEL_CLASS_WEIGHT = 2.0 44 | cfg.MODEL.MASK_FORMER.REGION_PROXY_CLS_WEIGHT = 2.0 45 | cfg.MODEL.MASK_FORMER.CONTRASTIVE_WEIGH = 2.0 46 | cfg.MODEL.MASK_FORMER.CONTRASTIVE_LOSS = False 47 | # cfg.MODEL.MASK_FORMER.EDGE_DISTANCES = [1, 2, 4, 8] 48 | cfg.MODEL.MASK_FORMER.HIGH_THRESHOLD = 0.3 49 | cfg.MODEL.MASK_FORMER.LOW_THRESHOLD = 0.05 50 | cfg.MODEL.MASK_FORMER.RETURN_ITERATION = False 51 | cfg.MODEL.MASK_FORMER.OBLIQUE_DISTANCES = [1, 2, 4, 8] 52 | # cfg.MODEL.MASK_FORMER.BYOL_WEIGH = 2.0 53 | # cfg.MODEL.MASK_FORMER.EDGE_WEIGH = 2.0 54 | # cfg.MODEL.MASK_FORMER.PSEUDO_EDGE_WEIGH = 2.0 55 | cfg.MODEL.MASK_FORMER.SPIX_PIXEL_CLS_WEIGH = 2.0 56 | # cfg.MODEL.MASK_FORMER.BYOL_LOSS = False 57 | # cfg.MODEL.MASK_FORMER.EDGE_LOSS = False 58 | cfg.MODEL.MASK_FORMER.CONTRASTIVE_TAU = 0.3 59 | cfg.MODEL.MASK_FORMER.COMPUTE_RAMA = False 60 | cfg.MODEL.MASK_FORMER.RECONSTRUCT_LOSS = False 61 | cfg.MODEL.MASK_FORMER.RECONSTRUCT_COLOR = False 62 | cfg.MODEL.MASK_FORMER.RECONSTRUCT_COORD = False 63 | cfg.MODEL.MASK_FORMER.STAGE_WEIGHTS = [1.0, 1.0] 64 | cfg.MODEL.MASK_FORMER.SPIX_MASK_STAGE2 = 1.0 65 | 66 | # transformer config 67 | cfg.MODEL.MASK_FORMER.NHEADS = 8 68 | cfg.MODEL.MASK_FORMER.DROPOUT = 0.1 69 | cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD = 2048 70 | cfg.MODEL.MASK_FORMER.ENC_LAYERS = 0 71 | cfg.MODEL.MASK_FORMER.DEC_LAYERS = 6 72 | cfg.MODEL.MASK_FORMER.SPIX_SELF_ATTEN_LAYERS = 6 73 | cfg.MODEL.MASK_FORMER.PRE_NORM = False 74 | 75 | cfg.MODEL.MASK_FORMER.HIDDEN_DIM = 256 76 | cfg.MODEL.MASK_FORMER.CONTRASTIVE_DIM = 128 77 | cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES = 100 78 | 79 | cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE = "res5" 80 | cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ = False 81 | 82 | # mask_former inference config 83 | 
cfg.MODEL.MASK_FORMER.TEST = CN() 84 | cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True 85 | cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = False 86 | cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = False 87 | cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0 88 | cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD = 0.0 89 | cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False 90 | # cfg.TEST.MODE = "whole" # "whole" or "slide" 91 | # cfg.TEST.STRIDE = (300, 768) 92 | # cfg.TEST.CROP_SIZE = (512, 1024) 93 | cfg.TEST.CLUSTER_SOFTMAX = False 94 | cfg.TEST.PRED_STAGE = "all" 95 | 96 | # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet) 97 | # you can use this config to override 98 | cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY = 32 99 | 100 | cfg.MODEL.MASK_FORMER.GZERO_CALIBRATE = -1.0 101 | cfg.MODEL.MASK_FORMER.ENSEMBLING = False 102 | cfg.MODEL.MASK_FORMER.ENSEMBLING_ALL_CLS = False 103 | 104 | # vis 105 | cfg.MODEL.MASK_FORMER.VIS = False 106 | cfg.MODEL.MASK_FORMER.QUERY_SHAPE = [8, 16] # h, w 107 | cfg.MODEL.MASK_FORMER.ENSEMBLING_START = 1 108 | 109 | # pixel decoder config 110 | cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256 111 | # adding transformer in pixel decoder 112 | cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0 113 | # pixel decoder 114 | cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder" 115 | # gzero calibrate 116 | cfg.MODEL.SEM_SEG_HEAD.GZERO_CALIBRATE = -1.0 117 | 118 | # swin transformer backbone 119 | cfg.MODEL.SWIN = CN() 120 | cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224 121 | cfg.MODEL.SWIN.PATCH_SIZE = 4 122 | cfg.MODEL.SWIN.EMBED_DIM = 96 123 | cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2] 124 | cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24] 125 | cfg.MODEL.SWIN.WINDOW_SIZE = 7 126 | cfg.MODEL.SWIN.MLP_RATIO = 4.0 127 | cfg.MODEL.SWIN.QKV_BIAS = True 128 | cfg.MODEL.SWIN.QK_SCALE = None 129 | cfg.MODEL.SWIN.DROP_RATE = 0.0 130 | cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0 131 | cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3 132 | cfg.MODEL.SWIN.APE = False 133 | cfg.MODEL.SWIN.PATCH_NORM = True 134 | cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"] 135 | cfg.MODEL.SWIN.USE_CHECKPOINT = False 136 | 137 | # pvt backbone 138 | cfg.MODEL.PVTV2 = CN() 139 | cfg.MODEL.PVTV2.PATCH_SIZE = 4 140 | cfg.MODEL.PVTV2.IN_CHANS = 3 141 | cfg.MODEL.PVTV2.EMBED_DIMS = [32, 64, 160, 256] 142 | cfg.MODEL.PVTV2.NUM_HEADS = [1, 2, 5, 8] 143 | cfg.MODEL.PVTV2.MLP_RATIO = [8, 8, 4, 4] 144 | cfg.MODEL.PVTV2.QKV_BIAS = True 145 | cfg.MODEL.PVTV2.DROP_RATE = 0.0 146 | cfg.MODEL.PVTV2.DROP_PATH_RATE = 0. 147 | cfg.MODEL.PVTV2.QK_SCALE = None 148 | cfg.MODEL.PVTV2.DEPTHS = [2, 2, 2, 2] 149 | cfg.MODEL.PVTV2.SR_RATIOS = [8, 4, 2, 1] 150 | cfg.MODEL.PVTV2.OUT_FEATURES = ["res2", "res3", "res4", "res5"] 151 | 152 | 153 | cfg.MODEL.SEM_SEG_HEAD.MASKATTENTIONPOOL = False 154 | cfg.MODEL.SEM_SEG_HEAD.TEMPERATURE = 0.01 155 | cfg.MODEL.SEM_SEG_HEAD.GAT_NUM_LAYERS = 2 156 | cfg.MODEL.SEM_SEG_HEAD.DOWNSAMPLE_RATE = 4 157 | # cfg.MODEL.CRITERION = "spix" # default 158 | 159 | # self training config 160 | cfg.MODEL.PSEUDO_LABEL = False 161 | cfg.MODEL.PSEUDO_WEIGHT = 1.0 162 | cfg.MODEL.PSEUDO_THR = -1. 
163 | 164 | 165 | cfg.MODEL.DYNAMIC_MEN_STD = False 166 | # cfg.MODEL.LAB_INPUT = False 167 | 168 | # NOTE: maskformer2 extra conffigs 169 | # transformer module 170 | cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME = "MultiScaleMaskedTransformerDecoder" 171 | 172 | # LSJ aug 173 | cfg.INPUT.IMAGE_SIZE = 1024 174 | cfg.INPUT.MIN_SCALE = 0.1 175 | cfg.INPUT.MAX_SCALE = 2.0 176 | 177 | # MSDeformAttn encoder configs 178 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"] 179 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4 180 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8 181 | 182 | # point loss configs 183 | # Number of points sampled during training for a mask point head. 184 | cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS = 112 * 112 185 | # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the 186 | # original paper. 187 | cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO = 3.0 188 | # Importance sampling parameter for PointRend point sampling during training. Parametr `beta` in 189 | # the original paper. 190 | cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75 191 | 192 | # params for groupformer 193 | cfg.MODEL.SEM_SEG_HEAD.NUM_GROUP_TOKENS = [256, 128, 64] 194 | cfg.MODEL.SEM_SEG_HEAD.NUM_OUTPUT_GROUPS = [256, 128, 64] 195 | cfg.MODEL.SEM_SEG_HEAD.NUM_HEADS = [8, 8, 8] 196 | cfg.MODEL.SEM_SEG_HEAD.SPIX_RES = [32, 32] 197 | cfg.MODEL.SEM_SEG_HEAD.MASK_POOL_STYLE = "attn_pool" 198 | cfg.MODEL.SEM_SEG_HEAD.TAU = 0.07 199 | 200 | cfg.MODEL.OUT_SUBMISSION_FORMAT = False 201 | 202 | cfg.MODEL.SEM_SEG_HEAD.SPIX_SELF_ATTEN = True 203 | cfg.MODEL.SEM_SEG_HEAD.SPIX_FFN = True 204 | -------------------------------------------------------------------------------- /demo/predictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copied from: https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py 3 | import atexit 4 | import bisect 5 | import multiprocessing as mp 6 | from collections import deque 7 | 8 | import cv2 9 | import torch 10 | 11 | from detectron2.data import MetadataCatalog 12 | from detectron2.engine.defaults import DefaultPredictor 13 | from detectron2.utils.video_visualizer import VideoVisualizer 14 | from detectron2.utils.visualizer import ColorMode, Visualizer 15 | 16 | 17 | class VisualizationDemo(object): 18 | def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False): 19 | """ 20 | Args: 21 | cfg (CfgNode): 22 | instance_mode (ColorMode): 23 | parallel (bool): whether to run the model in different processes from visualization. 24 | Useful since the visualization logic can be slow. 25 | """ 26 | # import ipdb; ipdb.set_trace() 27 | # self.metadata = MetadataCatalog.get( 28 | # cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused" 29 | # ) 30 | # TODO: fix it, sorry, hard coded for cityscapes categories 31 | self.metadata = MetadataCatalog.get( 32 | cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused" 33 | ) 34 | self.cpu_device = torch.device("cpu") 35 | self.instance_mode = instance_mode 36 | 37 | self.parallel = parallel 38 | if parallel: 39 | num_gpu = torch.cuda.device_count() 40 | self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu) 41 | else: 42 | self.predictor = DefaultPredictor(cfg) 43 | 44 | def run_on_image(self, image): 45 | """ 46 | Args: 47 | image (np.ndarray): an image of shape (H, W, C) (in BGR order). 
48 | This is the format used by OpenCV. 49 | Returns: 50 | predictions (dict): the output of the model. 51 | vis_output (VisImage): the visualized image output. 52 | """ 53 | vis_output = None 54 | predictions = self.predictor(image) 55 | # Convert image from OpenCV BGR format to Matplotlib RGB format. 56 | image = image[:, :, ::-1] 57 | # import ipdb; ipdb.set_trace() 58 | visualizer = Visualizer(image, self.metadata, instance_mode=self.instance_mode) 59 | if "panoptic_seg" in predictions: 60 | panoptic_seg, segments_info = predictions["panoptic_seg"] 61 | vis_output = visualizer.draw_panoptic_seg_predictions( 62 | panoptic_seg.to(self.cpu_device), segments_info 63 | ) 64 | else: 65 | if "sem_seg" in predictions: 66 | vis_output = visualizer.draw_sem_seg( 67 | predictions["sem_seg"].argmax(dim=0).to(self.cpu_device) 68 | ) 69 | if "instances" in predictions: 70 | instances = predictions["instances"].to(self.cpu_device) 71 | vis_output = visualizer.draw_instance_predictions(predictions=instances) 72 | 73 | return predictions, vis_output 74 | 75 | def _frame_from_video(self, video): 76 | while video.isOpened(): 77 | success, frame = video.read() 78 | if success: 79 | yield frame 80 | else: 81 | break 82 | 83 | def run_on_video(self, video): 84 | """ 85 | Visualizes predictions on frames of the input video. 86 | Args: 87 | video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be 88 | either a webcam or a video file. 89 | Yields: 90 | ndarray: BGR visualizations of each video frame. 91 | """ 92 | video_visualizer = VideoVisualizer(self.metadata, self.instance_mode) 93 | 94 | def process_predictions(frame, predictions): 95 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 96 | if "panoptic_seg" in predictions: 97 | panoptic_seg, segments_info = predictions["panoptic_seg"] 98 | vis_frame = video_visualizer.draw_panoptic_seg_predictions( 99 | frame, panoptic_seg.to(self.cpu_device), segments_info 100 | ) 101 | elif "instances" in predictions: 102 | predictions = predictions["instances"].to(self.cpu_device) 103 | vis_frame = video_visualizer.draw_instance_predictions(frame, predictions) 104 | elif "sem_seg" in predictions: 105 | vis_frame = video_visualizer.draw_sem_seg( 106 | frame, predictions["sem_seg"].argmax(dim=0).to(self.cpu_device) 107 | ) 108 | 109 | # Converts Matplotlib RGB format to OpenCV BGR format 110 | vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR) 111 | return vis_frame 112 | 113 | frame_gen = self._frame_from_video(video) 114 | if self.parallel: 115 | buffer_size = self.predictor.default_buffer_size 116 | 117 | frame_data = deque() 118 | 119 | for cnt, frame in enumerate(frame_gen): 120 | frame_data.append(frame) 121 | self.predictor.put(frame) 122 | 123 | if cnt >= buffer_size: 124 | frame = frame_data.popleft() 125 | predictions = self.predictor.get() 126 | yield process_predictions(frame, predictions) 127 | 128 | while len(frame_data): 129 | frame = frame_data.popleft() 130 | predictions = self.predictor.get() 131 | yield process_predictions(frame, predictions) 132 | else: 133 | for frame in frame_gen: 134 | yield process_predictions(frame, self.predictor(frame)) 135 | 136 | 137 | class AsyncPredictor: 138 | """ 139 | A predictor that runs the model asynchronously, possibly on >1 GPUs. 140 | Because rendering the visualization takes considerably amount of time, 141 | this helps improve throughput a little bit when rendering videos. 
142 | """ 143 | 144 | class _StopToken: 145 | pass 146 | 147 | class _PredictWorker(mp.Process): 148 | def __init__(self, cfg, task_queue, result_queue): 149 | self.cfg = cfg 150 | self.task_queue = task_queue 151 | self.result_queue = result_queue 152 | super().__init__() 153 | 154 | def run(self): 155 | predictor = DefaultPredictor(self.cfg) 156 | 157 | while True: 158 | task = self.task_queue.get() 159 | if isinstance(task, AsyncPredictor._StopToken): 160 | break 161 | idx, data = task 162 | result = predictor(data) 163 | self.result_queue.put((idx, result)) 164 | 165 | def __init__(self, cfg, num_gpus: int = 1): 166 | """ 167 | Args: 168 | cfg (CfgNode): 169 | num_gpus (int): if 0, will run on CPU 170 | """ 171 | num_workers = max(num_gpus, 1) 172 | self.task_queue = mp.Queue(maxsize=num_workers * 3) 173 | self.result_queue = mp.Queue(maxsize=num_workers * 3) 174 | self.procs = [] 175 | for gpuid in range(max(num_gpus, 1)): 176 | cfg = cfg.clone() 177 | cfg.defrost() 178 | cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu" 179 | self.procs.append( 180 | AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue) 181 | ) 182 | 183 | self.put_idx = 0 184 | self.get_idx = 0 185 | self.result_rank = [] 186 | self.result_data = [] 187 | 188 | for p in self.procs: 189 | p.start() 190 | atexit.register(self.shutdown) 191 | 192 | def put(self, image): 193 | self.put_idx += 1 194 | self.task_queue.put((self.put_idx, image)) 195 | 196 | def get(self): 197 | self.get_idx += 1 # the index needed for this request 198 | if len(self.result_rank) and self.result_rank[0] == self.get_idx: 199 | res = self.result_data[0] 200 | del self.result_data[0], self.result_rank[0] 201 | return res 202 | 203 | while True: 204 | # make sure the results are returned in the correct order 205 | idx, res = self.result_queue.get() 206 | if idx == self.get_idx: 207 | return res 208 | insert = bisect.bisect(self.result_rank, idx) 209 | self.result_rank.insert(insert, idx) 210 | self.result_data.insert(insert, res) 211 | 212 | def __len__(self): 213 | return self.put_idx - self.get_idx 214 | 215 | def __call__(self, image): 216 | self.put(image) 217 | return self.get() 218 | 219 | def shutdown(self): 220 | for _ in self.procs: 221 | self.task_queue.put(AsyncPredictor._StopToken()) 222 | 223 | @property 224 | def default_buffer_size(self): 225 | return len(self.procs) * 5 226 | --------------------------------------------------------------------------------
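
A minimal sketch of how the pieces above fit together: `add_maskformer2_config` from hgformer/config.py registers the extra options, and `VisualizationDemo` from demo/predictor.py wraps a `DefaultPredictor` for single-image inference. This assumes detectron2 (including its DeepLab project) is installed and the repository root is importable; the config file comes from the `configs/` tree above, while the checkpoint and input image names are placeholders. The repository's own entry point for this workflow is demo/inference.py, so treat this only as an illustrative sketch, not the canonical script.

import cv2
from detectron2.config import get_cfg
from detectron2.projects.deeplab import add_deeplab_config

from hgformer.config import add_maskformer2_config   # defined in hgformer/config.py above
from demo.predictor import VisualizationDemo         # defined in demo/predictor.py above


def setup_cfg(config_file, weights):
    cfg = get_cfg()
    add_deeplab_config(cfg)      # assumption: the base configs rely on DeepLab project keys
    add_maskformer2_config(cfg)  # adds the MASK_FORMER / SEM_SEG_HEAD / SWIN options shown above
    cfg.merge_from_file(config_file)
    cfg.MODEL.WEIGHTS = weights  # placeholder checkpoint path
    cfg.freeze()
    return cfg


if __name__ == "__main__":
    cfg = setup_cfg("configs/cityscapes/hgformer_R50_bs16_20k.yaml", "model_final.pth")
    demo = VisualizationDemo(cfg)       # parallel=True would route frames through AsyncPredictor
    image = cv2.imread("input.jpg")     # BGR (H, W, C), the format run_on_image expects
    predictions, vis_output = demo.run_on_image(image)
    vis_output.save("output.jpg")       # VisImage.save writes the visualization to disk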