├── lbvq
    ├── utils
    │   ├── __init__.py
    │   └── misc.py
    ├── modeling
    │   ├── __init__.py
    │   ├── transformer_decoder
    │   │   ├── __init__.py
    │   │   └── position_encoding.py
    │   └── lbvq_matcher.py
    ├── data
    │   ├── datasets
    │   │   ├── ytvis_api
    │   │   │   └── __init__.py
    │   │   ├── __init__.py
    │   │   ├── ovis.py
    │   │   └── builtin.py
    │   ├── __init__.py
    │   ├── combined_loader.py
    │   └── ytvis_eval.py
    ├── __init__.py
    └── config.py
├── mask2former
    ├── evaluation
    │   ├── __init__.py
    │   └── instance_evaluation.py
    ├── utils
    │   ├── __init__.py
    │   └── misc.py
    ├── modeling
    │   ├── backbone
    │   │   └── __init__.py
    │   ├── meta_arch
    │   │   ├── __init__.py
    │   │   └── mask_former_head.py
    │   ├── pixel_decoder
    │   │   ├── __init__.py
    │   │   └── ops
    │   │   │   ├── make.sh
    │   │   │   ├── modules
    │   │   │       ├── __init__.py
    │   │   │       └── ms_deform_attn.py
    │   │   │   ├── functions
    │   │   │       ├── __init__.py
    │   │   │       └── ms_deform_attn_func.py
    │   │   │   ├── src
    │   │   │       ├── vision.cpp
    │   │   │       ├── cuda
    │   │   │       │   ├── ms_deform_attn_cuda.h
    │   │   │       │   └── ms_deform_attn_cuda.cu
    │   │   │       ├── cpu
    │   │   │       │   ├── ms_deform_attn_cpu.h
    │   │   │       │   └── ms_deform_attn_cpu.cpp
    │   │   │       └── ms_deform_attn.h
    │   │   │   ├── setup.py
    │   │   │   └── test.py
    │   ├── transformer_decoder
    │   │   ├── __init__.py
    │   │   ├── position_encoding.py
    │   │   └── maskformer_transformer_decoder.py
    │   ├── __init__.py
    │   └── matcher.py
    ├── data
    │   ├── dataset_mappers
    │   │   ├── __init__.py
    │   │   ├── coco_panoptic_new_baseline_dataset_mapper.py
    │   │   ├── mask_former_panoptic_dataset_mapper.py
    │   │   ├── mask_former_instance_dataset_mapper.py
    │   │   ├── mask_former_semantic_dataset_mapper.py
    │   │   └── coco_instance_new_baseline_dataset_mapper.py
    │   ├── __init__.py
    │   └── datasets
    │   │   ├── __init__.py
    │   │   ├── register_ade20k_instance.py
    │   │   └── register_coco_panoptic_annos_semseg.py
    ├── __init__.py
    ├── test_time_augmentation.py
    └── config.py
├── LBVQ.png
├── requirements.txt
├── .gitignore
├── configs
    ├── youtubevis_2019
    │   ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml
    │   ├── lbvq_R50_bs8.yaml
    │   └── lbvq_R101_bs8.yaml
    ├── youtubevis_2021
    │   ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml
    │   └── lbvq_R50_bs8.yaml
    ├── youtubevis_2022
    │   ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml
    │   └── lbvq_R50_bs8.yaml
    └── ovis
    │   └── lbvq_R50_bs8.yaml
├── INSTALL.md
├── convert_coco2ytvis.py
├── datasets
    └── README.md
├── README.md
└── demo_lbvq
    ├── demo.py
    └── predictor.py


/lbvq/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/lbvq/modeling/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/mask2former/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/lbvq/data/datasets/ytvis_api/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/LBVQ.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fanghaook/LBVQ/HEAD/LBVQ.png


--------------------------------------------------------------------------------
/mask2former/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | 


--------------------------------------------------------------------------------
/mask2former/modeling/backbone/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | 


--------------------------------------------------------------------------------
/mask2former/modeling/meta_arch/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | 


--------------------------------------------------------------------------------
/mask2former/data/dataset_mappers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | 


--------------------------------------------------------------------------------
/mask2former/modeling/pixel_decoder/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | 


--------------------------------------------------------------------------------
/mask2former/data/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from . import datasets
3 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | cython
2 | scipy
3 | shapely
4 | timm
5 | h5py
6 | submitit
7 | scikit-image
8 | setuptools==59.4.0
9 | 


--------------------------------------------------------------------------------
/lbvq/modeling/transformer_decoder/__init__.py:
--------------------------------------------------------------------------------
1 | from .lbvq_mask2former_transformer_decoder import LbvqMultiScaleMaskedTransformerDecoder
2 | 


--------------------------------------------------------------------------------
/lbvq/data/__init__.py:
--------------------------------------------------------------------------------
1 | from .datasets import *
2 | from .dataset_mapper import YTVISDatasetMapper, CocoClipDatasetMapper
3 | from .build import *
4 | from .ytvis_eval import YTVISEvaluator
5 | 


--------------------------------------------------------------------------------
/lbvq/data/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from . import builtin  # ensure the builtin datasets are registered
2 | 
3 | __all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")]
4 | 


--------------------------------------------------------------------------------
/lbvq/__init__.py:
--------------------------------------------------------------------------------
 1 | # model code
 2 | from . import modeling
 3 | 
 4 | # config
 5 | from .config import add_lbvq_config
 6 | 
 7 | # models
 8 | from .lbvq_model import Lbvq
 9 | 
10 | # video
11 | from .data import *
12 | 


--------------------------------------------------------------------------------
/mask2former/modeling/transformer_decoder/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from .maskformer_transformer_decoder import StandardTransformerDecoder
3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder
4 | 


--------------------------------------------------------------------------------
/mask2former/data/datasets/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Facebook, Inc. and its affiliates.
 2 | from . import (
 3 |     register_ade20k_full,
 4 |     register_ade20k_panoptic,
 5 |     register_coco_stuff_10k,
 6 |     register_mapillary_vistas,
 7 |     register_coco_panoptic_annos_semseg,
 8 |     register_ade20k_instance,
 9 |     register_mapillary_vistas_panoptic,
10 | )
11 | 


--------------------------------------------------------------------------------
/mask2former/modeling/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from .backbone.swin import D2SwinTransformer
3 | from .pixel_decoder.fpn import BasePixelDecoder
4 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder
5 | from .meta_arch.mask_former_head import MaskFormerHead
6 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead
7 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # output dir
 2 | output*
 3 | 
 4 | *.json
 5 | *.diff
 6 | *.jpg
 7 | !/projects/DensePose/doc/images/*.jpg
 8 | 
 9 | # compilation and distribution
10 | __pycache__
11 | _ext
12 | *.pyc
13 | *.pyd
14 | *.so
15 | *.dll
16 | *.egg-info/
17 | build/
18 | dist/
19 | wheels/
20 | 
21 | # pytorch/python/numpy formats
22 | *.pth
23 | *.pkl
24 | *.npy
25 | *.ts
26 | model_ts*.txt
27 | 
28 | # ipython/jupyter notebooks
29 | *.ipynb
30 | **/.ipynb_checkpoints/
31 | 
32 | # Editor temporaries
33 | *.swn
34 | *.swo
35 | *.swp
36 | *~
37 | 
38 | # editor settings
39 | .idea
40 | .vscode
41 | _darcs
42 | 
43 | # project dirs
44 | /detectron2/model_zoo/configs
45 | /datasets/*
46 | !/datasets/*.*
47 | /projects/*/datasets
48 | /models
49 | /snippet
50 | 
51 | detectron2


--------------------------------------------------------------------------------
/mask2former/modeling/pixel_decoder/ops/make.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # ------------------------------------------------------------------------------------------------
 3 | # Deformable DETR
 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | # ------------------------------------------------------------------------------------------------
 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | # ------------------------------------------------------------------------------------------------
 9 | 
10 | # Copyright (c) Facebook, Inc. and its affiliates.
11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
12 | 
13 | python setup.py build install
14 | 


--------------------------------------------------------------------------------
/mask2former/modeling/pixel_decoder/ops/modules/__init__.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | from .ms_deform_attn import MSDeformAttn
13 | 


--------------------------------------------------------------------------------
/mask2former/modeling/pixel_decoder/ops/functions/__init__.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | from .ms_deform_attn_func import MSDeformAttnFunction
13 | 
14 | 


--------------------------------------------------------------------------------
/mask2former/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Facebook, Inc. and its affiliates.
 2 | from . import data  # register all new datasets
 3 | from . import modeling
 4 | 
 5 | # config
 6 | from .config import add_maskformer2_config
 7 | 
 8 | # dataset loading
 9 | from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper
10 | from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper
11 | from .data.dataset_mappers.mask_former_instance_dataset_mapper import (
12 |     MaskFormerInstanceDatasetMapper,
13 | )
14 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import (
15 |     MaskFormerPanopticDatasetMapper,
16 | )
17 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import (
18 |     MaskFormerSemanticDatasetMapper,
19 | )
20 | 
21 | # models
22 | from .maskformer_model import MaskFormer
23 | from .test_time_augmentation import SemanticSegmentorWithTTA
24 | 
25 | # evaluation
26 | from .evaluation.instance_evaluation import InstanceSegEvaluator
27 | 


--------------------------------------------------------------------------------
/mask2former/modeling/pixel_decoder/ops/src/vision.cpp:
--------------------------------------------------------------------------------
 1 | /*!
 2 | **************************************************************************************************
 3 | * Deformable DETR
 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | **************************************************************************************************
 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | **************************************************************************************************
 9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #include "ms_deform_attn.h"
17 | 
// Python bindings for the extension module: exposes the forward and backward
// entry points under the names "ms_deform_attn_forward" and
// "ms_deform_attn_backward". Both functions are expected to come from
// "ms_deform_attn.h" (included above).
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
  m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
}
22 | 


--------------------------------------------------------------------------------
/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h:
--------------------------------------------------------------------------------
 1 | /*!
 2 | **************************************************************************************************
 3 | * Deformable DETR
 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | **************************************************************************************************
 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | **************************************************************************************************
 9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #pragma once
17 | #include <torch/extension.h>
18 | 
19 | at::Tensor ms_deform_attn_cuda_forward(
20 |     const at::Tensor &value, 
21 |     const at::Tensor &spatial_shapes,
22 |     const at::Tensor &level_start_index,
23 |     const at::Tensor &sampling_loc,
24 |     const at::Tensor &attn_weight,
25 |     const int im2col_step);
26 | 
27 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
28 |     const at::Tensor &value, 
29 |     const at::Tensor &spatial_shapes,
30 |     const at::Tensor &level_start_index,
31 |     const at::Tensor &sampling_loc,
32 |     const at::Tensor &attn_weight,
33 |     const at::Tensor &grad_output,
34 |     const int im2col_step);
35 | 
36 | 


--------------------------------------------------------------------------------
/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h:
--------------------------------------------------------------------------------
 1 | /*!
 2 | **************************************************************************************************
 3 | * Deformable DETR
 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | **************************************************************************************************
 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | **************************************************************************************************
 9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #pragma once
17 | #include <torch/extension.h>
18 | 
19 | at::Tensor
20 | ms_deform_attn_cpu_forward(
21 |     const at::Tensor &value, 
22 |     const at::Tensor &spatial_shapes,
23 |     const at::Tensor &level_start_index,
24 |     const at::Tensor &sampling_loc,
25 |     const at::Tensor &attn_weight,
26 |     const int im2col_step);
27 | 
28 | std::vector<at::Tensor>
29 | ms_deform_attn_cpu_backward(
30 |     const at::Tensor &value, 
31 |     const at::Tensor &spatial_shapes,
32 |     const at::Tensor &level_start_index,
33 |     const at::Tensor &sampling_loc,
34 |     const at::Tensor &attn_weight,
35 |     const at::Tensor &grad_output,
36 |     const int im2col_step);
37 | 
38 | 
39 | 


--------------------------------------------------------------------------------
/configs/youtubevis_2019/Base-YouTubeVIS-VideoInstanceSegmentation.yaml:
--------------------------------------------------------------------------------
 1 | MODEL:
 2 |   BACKBONE:
 3 |     FREEZE_AT: 0
 4 |     NAME: "build_resnet_backbone"
 5 |   WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
 6 |   PIXEL_MEAN: [123.675, 116.280, 103.530]
 7 |   PIXEL_STD: [58.395, 57.120, 57.375]
 8 |   MASK_ON: True
 9 |   RESNETS:
10 |     DEPTH: 50
11 |     STEM_TYPE: "basic"  # not used
12 |     STEM_OUT_CHANNELS: 64
13 |     STRIDE_IN_1X1: False
14 |     OUT_FEATURES: ["res2", "res3", "res4", "res5"]
15 |     # NORM: "SyncBN"
16 |     RES5_MULTI_GRID: [1, 1, 1]  # not used
17 | DATASETS:
18 |   TRAIN: ("ytvis_2019_train",)
19 |   TEST: ("ytvis_2019_val",)
20 | SOLVER:
21 |   IMS_PER_BATCH: 16
22 |   BASE_LR: 0.0001
23 |   STEPS: (4000,)
24 |   MAX_ITER: 6000
25 |   WARMUP_FACTOR: 1.0
26 |   WARMUP_ITERS: 10
27 |   WEIGHT_DECAY: 0.05
28 |   OPTIMIZER: "ADAMW"
29 |   BACKBONE_MULTIPLIER: 0.1
30 |   CLIP_GRADIENTS:
31 |     ENABLED: True
32 |     CLIP_TYPE: "full_model"
33 |     CLIP_VALUE: 0.01
34 |     NORM_TYPE: 2.0
35 |   AMP:
36 |     ENABLED: True
37 | INPUT:
38 |   MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip"
39 |   RANDOM_FLIP: "flip_by_clip"
40 |   #AUGMENTATIONS: []
41 |   MIN_SIZE_TRAIN: (360, 480)
42 |   MIN_SIZE_TEST: 360
43 |   CROP:
44 |     ENABLED: False
45 |     TYPE: "absolute_range"
46 |     SIZE: (600, 720)
47 |   FORMAT: "RGB"
48 | TEST:
49 |   EVAL_PERIOD: 0
50 |   DETECTIONS_PER_IMAGE: 10
51 | DATALOADER:
52 |   FILTER_EMPTY_ANNOTATIONS: False
53 |   NUM_WORKERS: 4
54 | VERSION: 2
55 | 


--------------------------------------------------------------------------------
/configs/youtubevis_2021/Base-YouTubeVIS-VideoInstanceSegmentation.yaml:
--------------------------------------------------------------------------------
 1 | MODEL:
 2 |   BACKBONE:
 3 |     FREEZE_AT: 0
 4 |     NAME: "build_resnet_backbone"
 5 |   WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
 6 |   PIXEL_MEAN: [123.675, 116.280, 103.530]
 7 |   PIXEL_STD: [58.395, 57.120, 57.375]
 8 |   MASK_ON: True
 9 |   RESNETS:
10 |     DEPTH: 50
11 |     STEM_TYPE: "basic"  # not used
12 |     STEM_OUT_CHANNELS: 64
13 |     STRIDE_IN_1X1: False
14 |     OUT_FEATURES: ["res2", "res3", "res4", "res5"]
15 |     # NORM: "SyncBN"
16 |     RES5_MULTI_GRID: [1, 1, 1]  # not used
17 | DATASETS:
18 |   TRAIN: ("ytvis_2021_train",)
19 |   TEST: ("ytvis_2021_val",)
20 | SOLVER:
21 |   IMS_PER_BATCH: 16
22 |   BASE_LR: 0.0001
23 |   STEPS: (5500,)
24 |   MAX_ITER: 8000
25 |   WARMUP_FACTOR: 1.0
26 |   WARMUP_ITERS: 10
27 |   WEIGHT_DECAY: 0.05
28 |   OPTIMIZER: "ADAMW"
29 |   BACKBONE_MULTIPLIER: 0.1
30 |   CLIP_GRADIENTS:
31 |     ENABLED: True
32 |     CLIP_TYPE: "full_model"
33 |     CLIP_VALUE: 0.01
34 |     NORM_TYPE: 2.0
35 |   AMP:
36 |     ENABLED: True
37 | INPUT:
38 |   MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip"
39 |   RANDOM_FLIP: "flip_by_clip"
40 |   AUGMENTATIONS: []
41 |   MIN_SIZE_TRAIN: (360, 480)
42 |   MIN_SIZE_TEST: 360
43 |   CROP:
44 |     ENABLED: False
45 |     TYPE: "absolute_range"
46 |     SIZE: (600, 720)
47 |   FORMAT: "RGB"
48 | TEST:
49 |   EVAL_PERIOD: 0
50 |   DETECTIONS_PER_IMAGE: 10
51 | DATALOADER:
52 |   FILTER_EMPTY_ANNOTATIONS: False
53 |   NUM_WORKERS: 4
54 | VERSION: 2
55 | 


--------------------------------------------------------------------------------
/configs/youtubevis_2022/Base-YouTubeVIS-VideoInstanceSegmentation.yaml:
--------------------------------------------------------------------------------
 1 | MODEL:
 2 |   BACKBONE:
 3 |     FREEZE_AT: 0
 4 |     NAME: "build_resnet_backbone"
 5 |   WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
 6 |   PIXEL_MEAN: [123.675, 116.280, 103.530]
 7 |   PIXEL_STD: [58.395, 57.120, 57.375]
 8 |   MASK_ON: True
 9 |   RESNETS:
10 |     DEPTH: 50
11 |     STEM_TYPE: "basic"  # not used
12 |     STEM_OUT_CHANNELS: 64
13 |     STRIDE_IN_1X1: False
14 |     OUT_FEATURES: ["res2", "res3", "res4", "res5"]
15 |     # NORM: "SyncBN"
16 |     RES5_MULTI_GRID: [1, 1, 1]  # not used
17 | DATASETS:
18 |   TRAIN: ("ytvis_2021_train",)
19 |   TEST: ("ytvis_2021_val",)
20 | SOLVER:
21 |   IMS_PER_BATCH: 16
22 |   BASE_LR: 0.0001
23 |   STEPS: (5500,)
24 |   MAX_ITER: 8000
25 |   WARMUP_FACTOR: 1.0
26 |   WARMUP_ITERS: 10
27 |   WEIGHT_DECAY: 0.05
28 |   OPTIMIZER: "ADAMW"
29 |   BACKBONE_MULTIPLIER: 0.1
30 |   CLIP_GRADIENTS:
31 |     ENABLED: True
32 |     CLIP_TYPE: "full_model"
33 |     CLIP_VALUE: 0.01
34 |     NORM_TYPE: 2.0
35 |   AMP:
36 |     ENABLED: True
37 | INPUT:
38 |   MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip"
39 |   RANDOM_FLIP: "flip_by_clip"
40 |   AUGMENTATIONS: []
41 |   MIN_SIZE_TRAIN: (360, 480)
42 |   MIN_SIZE_TEST: 360
43 |   CROP:
44 |     ENABLED: False
45 |     TYPE: "absolute_range"
46 |     SIZE: (600, 720)
47 |   FORMAT: "RGB"
48 | TEST:
49 |   EVAL_PERIOD: 0
50 |   DETECTIONS_PER_IMAGE: 10
51 | DATALOADER:
52 |   FILTER_EMPTY_ANNOTATIONS: False
53 |   NUM_WORKERS: 4
54 | VERSION: 2
55 | 


--------------------------------------------------------------------------------
/lbvq/data/combined_loader.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | from collections import deque
 3 | from typing import Any, Collection, Deque, Iterable, Iterator, List, Sequence
 4 | 
 5 | Loader = Iterable[Any]
 6 | 
 7 | 
 8 | def _pooled_next(iterator: Iterator[Any], pool: Deque[Any]):
 9 |     if not pool:
10 |         pool.extend(next(iterator))
11 |     return pool.popleft()
12 | 
13 | 
14 | class CombinedDataLoader:
15 |     """
16 |     Combines data loaders using the provided sampling ratios
17 |     """
18 | 
19 |     BATCH_COUNT = 100
20 | 
21 |     def __init__(self, loaders: Collection[Loader], batch_size: int, ratios: Sequence[float]):
22 |         self.loaders = loaders
23 |         self.batch_size = batch_size
24 |         self.ratios = ratios
25 | 
26 |     def __iter__(self) -> Iterator[List[Any]]:
27 |         iters = [iter(loader) for loader in self.loaders]
28 |         indices = []
29 |         pool = [deque()] * len(iters)
30 |         # infinite iterator, as in D2
31 |         while True:
32 |             if not indices:
33 |                 # just a buffer of indices, its size doesn't matter
34 |                 # as long as it's a multiple of batch_size
35 |                 k = self.batch_size * self.BATCH_COUNT
36 |                 indices = random.choices(range(len(self.loaders)), self.ratios, k=k)
37 |             try:
38 |                 batch = [_pooled_next(iters[i], pool[i]) for i in indices[: self.batch_size]]
39 |             except StopIteration:
40 |                 break
41 |             indices = indices[self.batch_size :]
42 |             yield batch
43 | 


--------------------------------------------------------------------------------
/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp:
--------------------------------------------------------------------------------
 1 | /*!
 2 | **************************************************************************************************
 3 | * Deformable DETR
 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | **************************************************************************************************
 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | **************************************************************************************************
 9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #include <vector>
17 | 
18 | #include <ATen/ATen.h>
19 | #include <ATen/cuda/CUDAContext.h>
20 | 
21 | 
// CPU forward entry point. There is no CPU implementation: the body only
// reports "Not implement on cpu" through AT_ERROR and never returns a tensor.
// All parameters are accepted solely to match the declared signature and are
// left unused.
at::Tensor
ms_deform_attn_cpu_forward(
    const at::Tensor &value, 
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const int im2col_step)
{
    AT_ERROR("Not implement on cpu");
}
33 | 
// CPU backward entry point. There is no CPU implementation: the body only
// reports "Not implement on cpu" through AT_ERROR and never returns any
// gradients. All parameters are accepted solely to match the declared
// signature and are left unused.
std::vector<at::Tensor>
ms_deform_attn_cpu_backward(
    const at::Tensor &value, 
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const at::Tensor &grad_output,
    const int im2col_step)
{
    AT_ERROR("Not implement on cpu");
}
46 | 
47 | 


--------------------------------------------------------------------------------
/INSTALL.md:
--------------------------------------------------------------------------------
 1 | ## Installation
 2 | 
 3 | ### Requirements
 4 | - Linux or macOS with Python ≥ 3.6
 5 | - PyTorch ≥ 1.9 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation.
 6 |   Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note: please check
 7 |   that the PyTorch version matches the one required by Detectron2.
 8 | - Detectron2: follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html).
 9 | - OpenCV is optional but needed by demo and visualization
10 | - SAM is optional, but it is required if you want to use SAM refinement
11 | - `pip install -r requirements.txt`
12 | 
13 | ### CUDA kernel for MSDeformAttn
14 | After preparing the required environment, run the following command to compile the CUDA kernel for MSDeformAttn:
15 | 
16 | `CUDA_HOME` must be defined and point to the directory of the installed CUDA toolkit.
17 | 
18 | ```bash
19 | cd mask2former/modeling/pixel_decoder/ops
20 | sh make.sh
21 | ```
22 | 
23 | #### Building on another system
24 | To build on a system that does not have a GPU device but provides the drivers:
25 | ```bash
26 | TORCH_CUDA_ARCH_LIST='8.0' FORCE_CUDA=1 python setup.py build install
27 | ```
28 | 
29 | ### Example conda environment setup
30 | ```bash
31 | conda create --name lbvq python=3.8 -y
32 | conda activate lbvq
33 | conda install pytorch==1.9.0 torchvision==0.10.0 cudatoolkit=11.1 -c pytorch -c nvidia
34 | pip install -U opencv-python
35 | pip install git+https://github.com/facebookresearch/segment-anything.git
36 | 
37 | # under your working directory
38 | git clone git@github.com:facebookresearch/detectron2.git
39 | cd detectron2
40 | pip install -e .
41 | 
42 | git clone https://github.com/SysCV/sam-hq.git
43 | cd sam-hq
44 | pip install -e .
45 | 
46 | cd ..
47 | git clone https://github.com/fanghaook/LBVQ.git
48 | cd LBVQ
49 | pip install -r requirements.txt
50 | cd mask2former/modeling/pixel_decoder/ops
51 | sh make.sh
52 | ```
53 | 


--------------------------------------------------------------------------------
/lbvq/config.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from detectron2.config import CfgNode as CN
 3 | 
 4 | 
 5 | def add_lbvq_config(cfg):
 6 |     cfg.DATASETS.DATASET_RATIO = []
 7 | 
 8 |     # DataLoader
 9 |     cfg.INPUT.SAMPLING_FRAME_NUM = 2
10 |     cfg.INPUT.SAMPLING_FRAME_RANGE = 20
11 |     cfg.INPUT.SAMPLING_FRAME_SHUFFLE = False
12 |     cfg.INPUT.AUGMENTATIONS = [] # "brightness", "contrast", "saturation", "rotation"
13 | 
14 |     # Pseudo Data Use
15 |     cfg.INPUT.PSEUDO = CN()
16 |     cfg.INPUT.PSEUDO.AUGMENTATIONS = ['rotation']
17 |     cfg.INPUT.PSEUDO.MIN_SIZE_TRAIN = (480, 512, 544, 576, 608, 640, 672, 704, 736, 768)
18 |     cfg.INPUT.PSEUDO.MAX_SIZE_TRAIN = 768
19 |     cfg.INPUT.PSEUDO.MIN_SIZE_TRAIN_SAMPLING = "choice_by_clip"
20 |     cfg.INPUT.PSEUDO.CROP = CN()
21 |     cfg.INPUT.PSEUDO.CROP.ENABLED = False
22 |     cfg.INPUT.PSEUDO.CROP.TYPE = "absolute_range"
23 |     cfg.INPUT.PSEUDO.CROP.SIZE = (384, 600)
24 | 
25 |     # LSJ
26 |     cfg.INPUT.LSJ_AUG = CN()
27 |     cfg.INPUT.LSJ_AUG.ENABLED = False
28 |     cfg.INPUT.LSJ_AUG.IMAGE_SIZE = 1024
29 |     cfg.INPUT.LSJ_AUG.MIN_SCALE = 0.1
30 |     cfg.INPUT.LSJ_AUG.MAX_SCALE = 2.0
31 | 
32 |     # LBVQ
33 |     cfg.MODEL.LBVQ = CN()
34 |     cfg.MODEL.LBVQ.NHEADS = 8
35 |     cfg.MODEL.LBVQ.DROPOUT = 0.0
36 |     cfg.MODEL.LBVQ.DIM_FEEDFORWARD = 2048
37 |     cfg.MODEL.LBVQ.DEC_LAYERS = 6
38 |     cfg.MODEL.LBVQ.PRE_NORM = False
39 |     cfg.MODEL.LBVQ.HIDDEN_DIM = 256
40 |     cfg.MODEL.LBVQ.NUM_OBJECT_QUERIES = 100
41 |     cfg.MODEL.LBVQ.ENFORCE_INPUT_PROJ = True
42 | 
43 |     cfg.MODEL.LBVQ.NO_OBJECT_WEIGHT = 0.1
44 |     cfg.MODEL.LBVQ.DEEP_SUPERVISION = True
45 |     cfg.MODEL.LBVQ.LAST_LAYER_NUM = 3
46 |     cfg.MODEL.LBVQ.MULTI_CLS_ON = True
47 |     cfg.MODEL.LBVQ.APPLY_CLS_THRES = 0.01
48 | 
49 |     cfg.MODEL.LBVQ.SIM_USE_CLIP = True
50 |     cfg.MODEL.LBVQ.SIM_WEIGHT = 0.5
51 | 
52 |     cfg.MODEL.LBVQ.FREEZE_DETECTOR = False
53 |     cfg.MODEL.LBVQ.TEST_RUN_CHUNK_SIZE = 18
54 |     cfg.MODEL.LBVQ.TEST_INTERPOLATE_CHUNK_SIZE = 5
55 | 
56 |     # SAM
57 |     cfg.SAM = False
58 | 


--------------------------------------------------------------------------------
/convert_coco2ytvis.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json
 3 | 
 4 | from lbvq.data.datasets.ytvis import (
 5 |     COCO_TO_YTVIS_2019,
 6 |     COCO_TO_YTVIS_2021,
 7 | )
 8 | from lbvq.data.datasets.ovis import (
 9 |     COCO_TO_OVIS,
10 | )
11 | 
# Dataset root; falls back to ./datasets when DETECTRON2_DATASETS is unset.
_root = os.getenv("DETECTRON2_DATASETS", "datasets")

# Each entry: (category-id mapping, source COCO json, output json, log prefix).
convert_list = [
    (COCO_TO_YTVIS_2019,
        os.path.join(_root, "coco/annotations/instances_train2017.json"),
        os.path.join(_root, "coco/annotations/coco2ytvis2019_train.json"), "COCO to YTVIS 2019:"),
    (COCO_TO_YTVIS_2019,
        os.path.join(_root, "coco/annotations/instances_val2017.json"),
        os.path.join(_root, "coco/annotations/coco2ytvis2019_val.json"), "COCO val to YTVIS 2019:"),
    (COCO_TO_YTVIS_2021,
        os.path.join(_root, "coco/annotations/instances_train2017.json"),
        os.path.join(_root, "coco/annotations/coco2ytvis2021_train.json"), "COCO to YTVIS 2021:"),
    (COCO_TO_YTVIS_2021,
        os.path.join(_root, "coco/annotations/instances_val2017.json"),
        os.path.join(_root, "coco/annotations/coco2ytvis2021_val.json"), "COCO val to YTVIS 2021:"),
    (COCO_TO_OVIS,
        os.path.join(_root, "coco/annotations/instances_train2017.json"),
        os.path.join(_root, "coco/annotations/coco2ovis_train.json"), "COCO to OVIS:"),
]

for convert_dict, src_path, out_path, msg in convert_list:
    # Read and close the source before the output is opened, so a failed
    # read never leaves a truncated output file behind.
    with open(src_path, "r") as src_f:
        src_json = json.load(src_f)
    # Top-level keys: 'info', 'licenses', 'images', 'annotations', 'categories'

    # Copy every section except the annotations unchanged.
    out_json = {k: v for k, v in src_json.items() if k != 'annotations'}

    # Keep only annotations whose category id is a key of the mapping.
    # The annotations themselves are written out unmodified.
    out_json['annotations'] = [
        anno for anno in src_json['annotations']
        if anno["category_id"] in convert_dict
    ]
    converted_item_num = len(out_json['annotations'])

    with open(out_path, "w") as out_f:
        json.dump(out_json, out_f)
    print(msg, converted_item_num, "items converted.")
54 | 


--------------------------------------------------------------------------------
/mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h:
--------------------------------------------------------------------------------
 1 | /*!
 2 | **************************************************************************************************
 3 | * Deformable DETR
 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | **************************************************************************************************
 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | **************************************************************************************************
 9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #pragma once
17 | 
18 | #include "cpu/ms_deform_attn_cpu.h"
19 | 
20 | #ifdef WITH_CUDA
21 | #include "cuda/ms_deform_attn_cuda.h"
22 | #endif
23 | 
24 | 
25 | at::Tensor
26 | ms_deform_attn_forward(
27 |     const at::Tensor &value, 
28 |     const at::Tensor &spatial_shapes,
29 |     const at::Tensor &level_start_index,
30 |     const at::Tensor &sampling_loc,
31 |     const at::Tensor &attn_weight,
32 |     const int im2col_step)
33 | {
34 |     if (value.type().is_cuda())
35 |     {
36 | #ifdef WITH_CUDA
37 |         return ms_deform_attn_cuda_forward(
38 |             value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
39 | #else
40 |         AT_ERROR("Not compiled with GPU support");
41 | #endif
42 |     }
43 |     AT_ERROR("Not implemented on the CPU");
44 | }
45 | 
46 | std::vector<at::Tensor>
47 | ms_deform_attn_backward(
48 |     const at::Tensor &value, 
49 |     const at::Tensor &spatial_shapes,
50 |     const at::Tensor &level_start_index,
51 |     const at::Tensor &sampling_loc,
52 |     const at::Tensor &attn_weight,
53 |     const at::Tensor &grad_output,
54 |     const int im2col_step)
55 | {
56 |     if (value.type().is_cuda())
57 |     {
58 | #ifdef WITH_CUDA
59 |         return ms_deform_attn_cuda_backward(
60 |             value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
61 | #else
62 |         AT_ERROR("Not compiled with GPU support");
63 | #endif
64 |     }
65 |     AT_ERROR("Not implemented on the CPU");
66 | }
67 | 
68 | 


--------------------------------------------------------------------------------
/datasets/README.md:
--------------------------------------------------------------------------------
  1 | # Prepare Datasets for LBVQ
  2 | 
  3 | LBVQ has builtin support for a few datasets.
  4 | The datasets are assumed to exist in a directory specified by the environment variable
  5 | `DETECTRON2_DATASETS`.
  6 | Under this directory, detectron2 will look for datasets in the structure described below, if needed.
  7 | ```
  8 | $DETECTRON2_DATASETS/
  9 |   coco/
 10 |   ytvis_2019/
 11 |   ytvis_2021/
 12 |   ytvis_2022/
 13 |   ovis/
 14 | ```
 15 | 
 16 | You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`.
 17 | If left unset, the default is `./datasets` relative to your current working directory.
 18 | 
 19 | <!-- The [model zoo](https://github.com/facebookresearch/MaskFormer/blob/master/MODEL_ZOO.md)
 20 | contains configs and models that use these builtin datasets. -->
 21 | 
 22 | ## STEP-1: Prepare Image & Video Instance Segmentation datasets
 23 | ### Expected dataset structure for [COCO](https://cocodataset.org/#download):
 24 | 
 25 | ```
 26 | coco/
 27 |   annotations/
 28 |     instances_{train,val}2017.json
 29 |   {train,val}2017/
 30 |     # image files that are mentioned in the corresponding json
 31 | ```
 32 | 
 33 | ### Expected dataset structure for [YouTubeVIS 2019](https://codalab.lisn.upsaclay.fr/competitions/7682):
 34 | 
 35 | ```
 36 | ytvis_2019/
 37 |   {train,valid,test}.json
 38 |   {train,valid,test}/
 39 |     Annotations/
 40 |     JPEGImages/
 41 | ```
 42 | 
 43 | ### Expected dataset structure for [YouTubeVIS 2021](https://codalab.lisn.upsaclay.fr/competitions/7680):
 44 | 
 45 | ```
 46 | ytvis_2021/
 47 |   {train,valid,test}.json
 48 |   {train,valid,test}/
 49 |     JPEGImages/
 50 | ```
 51 | 
 52 | ### Expected dataset structure for [YouTubeVIS 2022](https://codalab.lisn.upsaclay.fr/competitions/3410):
 53 | 
 54 | ```
 55 | ytvis_2022/
 56 |   {valid,test}.json
 57 |   {valid,test}/
 58 |     JPEGImages/
 59 | ```
 60 | 
 61 | ### Expected dataset structure for [OVIS](https://codalab.lisn.upsaclay.fr/competitions/4763):
 62 | 
 63 | ```
 64 | ovis/
 65 |   annotations/
 66 |     {train,valid,test}.json
 67 |   {train,valid,test}/
 68 | ```
 69 | 
 70 | ## STEP-2: Prepare annotations for combined data
 71 | ```bash
 72 | python convert_coco2ytvis.py
 73 | ```
 74 | ### Expected final dataset structure for all:
 75 | ```
 76 | $DETECTRON2_DATASETS
 77 | +-- coco
 78 | |   |
 79 | |   +-- annotations
 80 | |   |   |
 81 | |   |   +-- instances_{train,val}2017.json
|   |   +-- coco2ytvis2019_{train,val}.json
|   |   +-- coco2ytvis2021_{train,val}.json
|   |   +-- coco2ovis_train.json
 85 | |   |
 86 | |   +-- {train,val}2017
 87 | |       |
 88 | |       +-- *.jpg
 89 | |
 90 | +-- ytvis_2019
 91 | |   ...
 92 | |
 93 | +-- ytvis_2021
 94 | |   ...
 95 | |
 96 | +-- ytvis_2022
 97 | |   ...
 98 | |
 99 | +-- ovis
100 |     ...
101 | ```
102 | 


--------------------------------------------------------------------------------
/mask2former/modeling/transformer_decoder/position_encoding.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Facebook, Inc. and its affiliates.
 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py
 3 | """
 4 | Various positional encodings for the transformer.
 5 | """
 6 | import math
 7 | 
 8 | import torch
 9 | from torch import nn
10 | 
11 | 
class PositionEmbeddingSine(nn.Module):
    """
    Sine/cosine position embedding for 2D feature maps.

    Row and column positions are obtained by cumulatively counting the
    unmasked cells along each spatial axis; each position is then encoded with
    interleaved sin/cos terms at ``num_pos_feats`` frequencies per axis.
    The output has ``2 * num_pos_feats`` channels (row encoding first, then
    column encoding).
    """

    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
        super().__init__()
        # A custom scale only makes sense together with normalization.
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        self.scale = 2 * math.pi if scale is None else scale

    def forward(self, x, mask=None):
        """
        Compute the embedding for ``x``.

        ``x`` is only used for its sizes (dims 0, 2, 3) and device. ``mask``
        is an optional boolean tensor where True marks cells to exclude from
        the position count; when omitted, every cell is counted.
        """
        if mask is None:
            mask_shape = (x.size(0), x.size(2), x.size(3))
            mask = torch.zeros(mask_shape, device=x.device, dtype=torch.bool)
        valid = ~mask
        row_pos = valid.cumsum(1, dtype=torch.float32)
        col_pos = valid.cumsum(2, dtype=torch.float32)
        if self.normalize:
            # Divide by the last (largest) cumulative count, then rescale.
            eps = 1e-6
            row_pos = row_pos / (row_pos[:, -1:, :] + eps) * self.scale
            col_pos = col_pos / (col_pos[:, :, -1:] + eps) * self.scale

        # Channels 2k and 2k+1 share one frequency.
        channel_idx = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        exponent = 2 * torch.div(channel_idx, 2, rounding_mode="floor") / self.num_pos_feats
        freq = self.temperature ** exponent

        col_angles = col_pos[:, :, :, None] / freq
        row_angles = row_pos[:, :, :, None] / freq

        # Even channels take sin, odd channels take cos; stacking on a new
        # trailing axis and flattening interleaves them as sin, cos, sin, ...
        col_sin = col_angles[:, :, :, 0::2].sin()
        col_cos = col_angles[:, :, :, 1::2].cos()
        col_enc = torch.stack((col_sin, col_cos), dim=4).flatten(3)

        row_sin = row_angles[:, :, :, 0::2].sin()
        row_cos = row_angles[:, :, :, 1::2].cos()
        row_enc = torch.stack((row_sin, row_cos), dim=4).flatten(3)

        # Channels-last -> channels-first.
        return torch.cat((row_enc, col_enc), dim=3).permute(0, 3, 1, 2)

    def __repr__(self, _repr_indent=4):
        pad = " " * _repr_indent
        lines = ["Positional encoding " + self.__class__.__name__]
        lines.append(pad + f"num_pos_feats: {self.num_pos_feats}")
        lines.append(pad + f"temperature: {self.temperature}")
        lines.append(pad + f"normalize: {self.normalize}")
        lines.append(pad + f"scale: {self.scale}")
        return "\n".join(lines)
65 | 


--------------------------------------------------------------------------------
/lbvq/data/datasets/ovis.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | """
 4 | This file contains functions to parse OVIS dataset of
 5 | COCO-format annotations into dicts in "Detectron2 format".
 6 | """
 7 | 
 8 | logger = logging.getLogger(__name__)
 9 | 
10 | 
# Maps COCO category ids (keys) to OVIS category ids (values; see OVIS_CATEGORIES).
# COCO ids 3, 6 and 8 all collapse onto the single OVIS id 25.
# COCO categories that are not keys here have no OVIS counterpart.
COCO_TO_OVIS = {
    1:1, 2:21, 3:25, 4:22, 5:23, 6:25, 8:25, 9:24, 17:3, 18:4, 19:5, 20:6, 21:7, 22:8, 23:9, 24:10, 25:11, 
}
14 | 
15 | 
# OVIS category table: one dict per class with its dataset id (1..25), its
# name, an "isthing" flag (1 for every entry) and a 3-int color
# (presumably RGB used for visualization - confirm against the consumer).
# NOTE(review): the name "Vehical" is a runtime string and is kept verbatim;
# confirm against the dataset's annotation files before changing the spelling.
OVIS_CATEGORIES = [
    {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "Person"},
    {"color": [255, 109, 65], "isthing": 1, "id": 2, "name": "Bird"},
    {"color": [255, 77, 255], "isthing": 1, "id": 3, "name": "Cat"},
    {"color": [0, 226, 252], "isthing": 1, "id": 4, "name": "Dog"},
    {"color": [182, 182, 255], "isthing": 1, "id": 5, "name": "Horse"},
    {"color": [255, 208, 186], "isthing": 1, "id": 6, "name": "Sheep"},
    {"color": [120, 166, 157], "isthing": 1, "id": 7, "name": "Cow"},
    {"color": [110, 76, 0], "isthing": 1, "id": 8, "name": "Elephant"},
    {"color": [174, 57, 255], "isthing": 1, "id": 9, "name": "Bear"},
    {"color": [199, 100, 0], "isthing": 1, "id": 10, "name": "Zebra"},
    {"color": [72, 0, 118], "isthing": 1, "id": 11, "name": "Giraffe"},
    {"color": [107, 142, 35], "isthing": 1, "id": 12, "name": "Poultry"},
    {"color": [0, 82, 0], "isthing": 1, "id": 13, "name": "Giant_panda"},
    {"color": [119, 11, 32], "isthing": 1, "id": 14, "name": "Lizard"},
    {"color": [165, 42, 42], "isthing": 1, "id": 15, "name": "Parrot"},
    {"color": [0, 60, 100], "isthing": 1, "id": 16, "name": "Monkey"},
    {"color": [100, 170, 30], "isthing": 1, "id": 17, "name": "Rabbit"},
    {"color": [166, 196, 102], "isthing": 1, "id": 18, "name": "Tiger"},
    {"color": [73, 77, 174], "isthing": 1, "id": 19, "name": "Fish"},
    {"color": [0, 143, 149], "isthing": 1, "id": 20, "name": "Turtle"},
    {"color": [134, 134, 103], "isthing": 1, "id": 21, "name": "Bicycle"},
    {"color": [0, 0, 230], "isthing": 1, "id": 22, "name": "Motorcycle"},
    {"color": [106, 0, 228], "isthing": 1, "id": 23, "name": "Airplane"},
    {"color": [0, 0, 192], "isthing": 1, "id": 24, "name": "Boat"},
    {"color": [0, 0, 142], "isthing": 1, "id": 25, "name": "Vehical"},
]
43 | 
44 | 
def _get_ovis_instances_meta():
    """
    Build the instance metadata dict for OVIS from OVIS_CATEGORIES.

    Returns a dict with three entries, all restricted to "thing" categories
    and listed in table order:
      - "thing_dataset_id_to_contiguous_id": dataset id -> index in [0, 24]
      - "thing_classes": category names
      - "thing_colors": category colors
    """
    things = [cat for cat in OVIS_CATEGORIES if cat["isthing"] == 1]
    dataset_ids = [cat["id"] for cat in things]
    colors = [cat["color"] for cat in things]
    # OVIS defines exactly 25 thing classes; fail loudly if the table drifts.
    assert len(dataset_ids) == 25, len(dataset_ids)
    # Map each OVIS dataset id to its position in the table, i.e. [0, 24].
    id_to_index = {dataset_id: index for index, dataset_id in enumerate(dataset_ids)}
    names = [cat["name"] for cat in things]
    return {
        "thing_dataset_id_to_contiguous_id": id_to_index,
        "thing_classes": names,
        "thing_colors": colors,
    }


--------------------------------------------------------------------------------
/configs/youtubevis_2022/lbvq_R50_bs8.yaml:
--------------------------------------------------------------------------------
  1 | _BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml
  2 | MODEL:
  3 |   WEIGHTS: "lbvq_r50_ytvis21.pth"
  4 |   META_ARCHITECTURE: "Lbvq"
  5 |   MASK_ON: True
  6 |   SEM_SEG_HEAD:
  7 |     NAME: "MaskFormerHead"
  8 |     IGNORE_VALUE: 255
  9 |     NUM_CLASSES: 40
 10 |     LOSS_WEIGHT: 1.0
 11 |     CONVS_DIM: 256
 12 |     MASK_DIM: 256
 13 |     NORM: "GN"
 14 |     # pixel decoder
 15 |     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
 16 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
 17 |     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
 18 |     COMMON_STRIDE: 4
 19 |     TRANSFORMER_ENC_LAYERS: 6
 20 |   MASK_FORMER:
 21 |     TRANSFORMER_DECODER_NAME: "LbvqMultiScaleMaskedTransformerDecoder"
 22 |     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
 23 |     DEEP_SUPERVISION: True
 24 |     NO_OBJECT_WEIGHT: 0.1
 25 |     CLASS_WEIGHT: 2.0
 26 |     MASK_WEIGHT: 5.0
 27 |     DICE_WEIGHT: 5.0
 28 |     HIDDEN_DIM: 256
 29 |     NUM_OBJECT_QUERIES: 100
 30 |     NHEADS: 8
 31 |     DROPOUT: 0.0
 32 |     DIM_FEEDFORWARD: 2048
 33 |     ENC_LAYERS: 0
 34 |     PRE_NORM: False
 35 |     ENFORCE_INPUT_PROJ: False
 36 |     SIZE_DIVISIBILITY: 32
 37 |     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
 38 |     TRAIN_NUM_POINTS: 12544
 39 |     OVERSAMPLE_RATIO: 3.0
 40 |     IMPORTANCE_SAMPLE_RATIO: 0.75
 41 |     TEST:
 42 |       SEMANTIC_ON: False
 43 |       INSTANCE_ON: True
 44 |       PANOPTIC_ON: False
 45 |       OVERLAP_THRESHOLD: 0.8
 46 |       OBJECT_MASK_THRESHOLD: 0.8
 47 | DATASETS:
 48 |   DATASET_RATIO: [1.0, 0.8]
 49 |   TRAIN: ("coco2ytvis2021_train", "ytvis_2021_train")
 50 |   TEST: ("ytvis_2022_val",)
 51 | SOLVER:
 52 |   IMS_PER_BATCH: 8
 53 |   BASE_LR: 0.00005
 54 |   STEPS: (100000,)
 55 |   MAX_ITER: 187500
 56 |   WARMUP_FACTOR: 1.0
 57 |   WARMUP_ITERS: 10
 58 |   WEIGHT_DECAY: 0.05
 59 |   OPTIMIZER: "ADAMW"
 60 |   BACKBONE_MULTIPLIER: 0.1
 61 |   CLIP_GRADIENTS:
 62 |     ENABLED: True
 63 |     CLIP_TYPE: "full_model"
 64 |     CLIP_VALUE: 0.01
 65 |     NORM_TYPE: 2.0
 66 | INPUT:
 67 |   SAMPLING_FRAME_NUM: 6
 68 |   SAMPLING_FRAME_RANGE: 8
 69 |   SAMPLING_FRAME_SHUFFLE: False
 70 |   # MIN_SIZE_TRAIN_SAMPLING : ["range", "choice", "range_by_clip", "choice_by_clip"]
 71 |   MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip"
 72 |   # RANDOM_FLIP : ["none", "horizontal", "flip_by_clip"]. "horizontal" is set by default.
 73 |   RANDOM_FLIP: "flip_by_clip"
 74 |   AUGMENTATIONS: []
 75 |   MIN_SIZE_TRAIN: (288, 320, 352, 384, 416, 448, 480, 512)
 76 |   MAX_SIZE_TRAIN: 768
 77 |   MIN_SIZE_TEST: 360
 78 |   FORMAT: "RGB"
 79 |   CROP:
 80 |     ENABLED: True
 81 |     TYPE: "absolute_range"
 82 |     SIZE: (384, 600)
 83 |   # For pseudo videos
 84 |   PSEUDO:
 85 |     AUGMENTATIONS: ['rotation']
 86 |     MIN_SIZE_TRAIN: (288, 320, 352, 384, 416, 448, 480, 512)
 87 |     MAX_SIZE_TRAIN: 768
 88 |     CROP:
 89 |       ENABLED: True
 90 |       TYPE: "absolute_range"
 91 |       SIZE: (384, 600)
 92 |   LSJ_AUG:
 93 |     ENABLED: False
 94 |     IMAGE_SIZE: 768
 95 |     MIN_SCALE: 0.1
 96 |     MAX_SCALE: 2.0
 97 | DATALOADER:
 98 |   FILTER_EMPTY_ANNOTATIONS: True
 99 |   NUM_WORKERS: 8
100 | 


--------------------------------------------------------------------------------
/configs/youtubevis_2019/lbvq_R50_bs8.yaml:
--------------------------------------------------------------------------------
  1 | _BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml
  2 | MODEL:
  3 |   WEIGHTS: "mask2former_r50_coco.pkl"
  4 |   META_ARCHITECTURE: "Lbvq"
  5 |   MASK_ON: True
  6 |   SEM_SEG_HEAD:
  7 |     NAME: "MaskFormerHead"
  8 |     IGNORE_VALUE: 255
  9 |     NUM_CLASSES: 40
 10 |     LOSS_WEIGHT: 1.0
 11 |     CONVS_DIM: 256
 12 |     MASK_DIM: 256
 13 |     NORM: "GN"
 14 |     # pixel decoder
 15 |     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
 16 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
 17 |     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
 18 |     COMMON_STRIDE: 4
 19 |     TRANSFORMER_ENC_LAYERS: 6
 20 |   MASK_FORMER:
 21 |     TRANSFORMER_DECODER_NAME: "LbvqMultiScaleMaskedTransformerDecoder"
 22 |     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
 23 |     DEEP_SUPERVISION: True
 24 |     NO_OBJECT_WEIGHT: 0.1
 25 |     CLASS_WEIGHT: 2.0
 26 |     MASK_WEIGHT: 5.0
 27 |     DICE_WEIGHT: 5.0
 28 |     HIDDEN_DIM: 256
 29 |     NUM_OBJECT_QUERIES: 100
 30 |     NHEADS: 8
 31 |     DROPOUT: 0.0
 32 |     DIM_FEEDFORWARD: 2048
 33 |     ENC_LAYERS: 0
 34 |     PRE_NORM: False
 35 |     ENFORCE_INPUT_PROJ: False
 36 |     SIZE_DIVISIBILITY: 32
 37 |     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
 38 |     TRAIN_NUM_POINTS: 12544
 39 |     OVERSAMPLE_RATIO: 3.0
 40 |     IMPORTANCE_SAMPLE_RATIO: 0.75
 41 |     TEST:
 42 |       SEMANTIC_ON: False
 43 |       INSTANCE_ON: True
 44 |       PANOPTIC_ON: False
 45 |       OVERLAP_THRESHOLD: 0.8
 46 |       OBJECT_MASK_THRESHOLD: 0.8
 47 | DATASETS:
 48 |   DATASET_RATIO: [1.0, 0.75]
 49 |   TRAIN: ("coco2ytvis2019_train", "ytvis_2019_train")
 50 |   TEST: ("ytvis_2019_val",)
 51 | SOLVER:
 52 |   IMS_PER_BATCH: 16
 53 |   BASE_LR: 0.00005
 54 |   STEPS: (75000,)
 55 |   MAX_ITER: 140000
 56 |   WARMUP_FACTOR: 1.0
 57 |   WARMUP_ITERS: 10
 58 |   WEIGHT_DECAY: 0.05
 59 |   OPTIMIZER: "ADAMW"
 60 |   BACKBONE_MULTIPLIER: 0.1
 61 |   CLIP_GRADIENTS:
 62 |     ENABLED: True
 63 |     CLIP_TYPE: "full_model"
 64 |     CLIP_VALUE: 0.01
 65 |     NORM_TYPE: 2.0
 66 | INPUT:
 67 |   SAMPLING_FRAME_NUM: 5
 68 |   SAMPLING_FRAME_RANGE: 20
 69 |   SAMPLING_FRAME_SHUFFLE: False
 70 |   # MIN_SIZE_TRAIN_SAMPLING : ["range", "choice", "range_by_clip", "choice_by_clip"]
 71 |   MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip"
 72 |   # RANDOM_FLIP : ["none", "horizontal", "flip_by_clip"]. "horizontal" is set by default.
 73 |   RANDOM_FLIP: "flip_by_clip"
 74 |   AUGMENTATIONS: []
 75 |   MIN_SIZE_TRAIN: (288, 320, 352, 384, 416, 448, 480, 512)
 76 |   MAX_SIZE_TRAIN: 768
 77 |   MIN_SIZE_TEST: 360
 78 |   FORMAT: "RGB"
 79 |   CROP:
 80 |     ENABLED: True
 81 |     TYPE: "absolute_range"
 82 |     SIZE: (384, 600)
 83 |   # For pseudo videos
 84 |   PSEUDO:
 85 |     AUGMENTATIONS: ['rotation']
 86 |     MIN_SIZE_TRAIN: (288, 320, 352, 384, 416, 448, 480, 512)
 87 |     MAX_SIZE_TRAIN: 768
 88 |     CROP:
 89 |       ENABLED: True
 90 |       TYPE: "absolute_range"
 91 |       SIZE: (384, 600)
 92 |   LSJ_AUG:
 93 |     ENABLED: False
 94 |     IMAGE_SIZE: 768
 95 |     MIN_SCALE: 0.1
 96 |     MAX_SCALE: 2.0
 97 | DATALOADER:
 98 |   FILTER_EMPTY_ANNOTATIONS: True
 99 |   NUM_WORKERS: 8
100 | 


--------------------------------------------------------------------------------
/configs/youtubevis_2021/lbvq_R50_bs8.yaml:
--------------------------------------------------------------------------------
  1 | _BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml
  2 | MODEL:
  3 |   WEIGHTS: "mask2former_r50_coco.pkl"
  4 |   META_ARCHITECTURE: "Lbvq"
  5 |   MASK_ON: True
  6 |   SEM_SEG_HEAD:
  7 |     NAME: "MaskFormerHead"
  8 |     IGNORE_VALUE: 255
  9 |     NUM_CLASSES: 40
 10 |     LOSS_WEIGHT: 1.0
 11 |     CONVS_DIM: 256
 12 |     MASK_DIM: 256
 13 |     NORM: "GN"
 14 |     # pixel decoder
 15 |     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
 16 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
 17 |     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
 18 |     COMMON_STRIDE: 4
 19 |     TRANSFORMER_ENC_LAYERS: 6
 20 |   MASK_FORMER:
 21 |     TRANSFORMER_DECODER_NAME: "LbvqMultiScaleMaskedTransformerDecoder"
 22 |     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
 23 |     DEEP_SUPERVISION: True
 24 |     NO_OBJECT_WEIGHT: 0.1
 25 |     CLASS_WEIGHT: 2.0
 26 |     MASK_WEIGHT: 5.0
 27 |     DICE_WEIGHT: 5.0
 28 |     HIDDEN_DIM: 256
 29 |     NUM_OBJECT_QUERIES: 100
 30 |     NHEADS: 8
 31 |     DROPOUT: 0.0
 32 |     DIM_FEEDFORWARD: 2048
 33 |     ENC_LAYERS: 0
 34 |     PRE_NORM: False
 35 |     ENFORCE_INPUT_PROJ: False
 36 |     SIZE_DIVISIBILITY: 32
 37 |     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
 38 |     TRAIN_NUM_POINTS: 12544
 39 |     OVERSAMPLE_RATIO: 3.0
 40 |     IMPORTANCE_SAMPLE_RATIO: 0.75
 41 |     TEST:
 42 |       SEMANTIC_ON: False
 43 |       INSTANCE_ON: True
 44 |       PANOPTIC_ON: False
 45 |       OVERLAP_THRESHOLD: 0.8
 46 |       OBJECT_MASK_THRESHOLD: 0.8
 47 | DATASETS:
 48 |   DATASET_RATIO: [1.0, 0.8]
 49 |   TRAIN: ("coco2ytvis2021_train", "ytvis_2021_train")
 50 |   TEST: ("ytvis_2021_val",)
 51 | SOLVER:
 52 |   IMS_PER_BATCH: 8
 53 |   BASE_LR: 0.00005
 54 |   STEPS: (100000,)
 55 |   MAX_ITER: 187500
 56 |   WARMUP_FACTOR: 1.0
 57 |   WARMUP_ITERS: 10
 58 |   WEIGHT_DECAY: 0.05
 59 |   OPTIMIZER: "ADAMW"
 60 |   BACKBONE_MULTIPLIER: 0.1
 61 |   CLIP_GRADIENTS:
 62 |     ENABLED: True
 63 |     CLIP_TYPE: "full_model"
 64 |     CLIP_VALUE: 0.01
 65 |     NORM_TYPE: 2.0
 66 | INPUT:
 67 |   SAMPLING_FRAME_NUM: 6
 68 |   SAMPLING_FRAME_RANGE: 8
 69 |   SAMPLING_FRAME_SHUFFLE: False
 70 |   # MIN_SIZE_TRAIN_SAMPLING : ["range", "choice", "range_by_clip", "choice_by_clip"]
 71 |   MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip"
 72 |   # RANDOM_FLIP : ["none", "horizontal", "flip_by_clip"]. "horizontal" is set by default.
 73 |   RANDOM_FLIP: "flip_by_clip"
 74 |   AUGMENTATIONS: []
 75 |   MIN_SIZE_TRAIN: (288, 320, 352, 384, 416, 448, 480, 512)
 76 |   MAX_SIZE_TRAIN: 768
 77 |   MIN_SIZE_TEST: 360
 78 |   FORMAT: "RGB"
 79 |   CROP:
 80 |     ENABLED: True
 81 |     TYPE: "absolute_range"
 82 |     SIZE: (384, 600)
 83 |   # For pseudo videos
 84 |   PSEUDO:
 85 |     AUGMENTATIONS: ['rotation']
 86 |     MIN_SIZE_TRAIN: (288, 320, 352, 384, 416, 448, 480, 512)
 87 |     MAX_SIZE_TRAIN: 768
 88 |     CROP:
 89 |       ENABLED: True
 90 |       TYPE: "absolute_range"
 91 |       SIZE: (384, 600)
 92 |   LSJ_AUG:
 93 |     ENABLED: False
 94 |     IMAGE_SIZE: 768
 95 |     MIN_SCALE: 0.1
 96 |     MAX_SCALE: 2.0
 97 | DATALOADER:
 98 |   FILTER_EMPTY_ANNOTATIONS: True
 99 |   NUM_WORKERS: 8
100 | 


--------------------------------------------------------------------------------
/configs/ovis/lbvq_R50_bs8.yaml:
--------------------------------------------------------------------------------
  1 | _BASE_: ../youtubevis_2019/Base-YouTubeVIS-VideoInstanceSegmentation.yaml
  2 | MODEL:
  3 |   WEIGHTS: "mask2former_r50_coco.pkl"
  4 |   META_ARCHITECTURE: "Lbvq"
  5 |   MASK_ON: True
  6 |   SEM_SEG_HEAD:
  7 |     NAME: "MaskFormerHead"
  8 |     IGNORE_VALUE: 255
  9 |     NUM_CLASSES: 25
 10 |     LOSS_WEIGHT: 1.0
 11 |     CONVS_DIM: 256
 12 |     MASK_DIM: 256
 13 |     NORM: "GN"
 14 |     # pixel decoder
 15 |     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
 16 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
 17 |     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
 18 |     COMMON_STRIDE: 4
 19 |     TRANSFORMER_ENC_LAYERS: 6
 20 |   MASK_FORMER:
 21 |     TRANSFORMER_DECODER_NAME: "LbvqMultiScaleMaskedTransformerDecoder"
 22 |     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
 23 |     DEEP_SUPERVISION: True
 24 |     NO_OBJECT_WEIGHT: 0.1
 25 |     CLASS_WEIGHT: 2.0
 26 |     MASK_WEIGHT: 5.0
 27 |     DICE_WEIGHT: 5.0
 28 |     HIDDEN_DIM: 256
 29 |     NUM_OBJECT_QUERIES: 100
 30 |     NHEADS: 8
 31 |     DROPOUT: 0.0
 32 |     DIM_FEEDFORWARD: 2048
 33 |     ENC_LAYERS: 0
 34 |     PRE_NORM: False
 35 |     ENFORCE_INPUT_PROJ: False
 36 |     SIZE_DIVISIBILITY: 32
 37 |     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
 38 |     TRAIN_NUM_POINTS: 12544
 39 |     OVERSAMPLE_RATIO: 3.0
 40 |     IMPORTANCE_SAMPLE_RATIO: 0.75
 41 |     TEST:
 42 |       SEMANTIC_ON: False
 43 |       INSTANCE_ON: True
 44 |       PANOPTIC_ON: False
 45 |       OVERLAP_THRESHOLD: 0.8
 46 |       OBJECT_MASK_THRESHOLD: 0.8
 47 | DATASETS:
 48 |   DATASET_RATIO: [1.0, 0.25]
 49 |   TRAIN: ("coco2ovis_train", "ovis_train")
 50 |   TEST: ("ovis_val",)
 51 | SOLVER:
 52 |   IMS_PER_BATCH: 8
 53 |   BASE_LR: 0.00005
 54 |   STEPS: (50000, 125000)
 55 |   MAX_ITER: 150000
 56 |   WARMUP_FACTOR: 1.0
 57 |   WARMUP_ITERS: 10
 58 |   WEIGHT_DECAY: 0.05
 59 |   OPTIMIZER: "ADAMW"
 60 |   BACKBONE_MULTIPLIER: 0.1
 61 |   CLIP_GRADIENTS:
 62 |     ENABLED: True
 63 |     CLIP_TYPE: "full_model"
 64 |     CLIP_VALUE: 0.01
 65 |     NORM_TYPE: 2.0
 66 | INPUT:
 67 |   SAMPLING_FRAME_NUM: 6
 68 |   SAMPLING_FRAME_RANGE: 8
 69 |   SAMPLING_FRAME_SHUFFLE: False
 70 |   # MIN_SIZE_TRAIN_SAMPLING : ["range", "choice", "range_by_clip", "choice_by_clip"]
 71 |   MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip"
 72 |   # RANDOM_FLIP : ["none", "horizontal", "flip_by_clip"]. "horizontal" is set by default.
 73 |   RANDOM_FLIP: "flip_by_clip"
 74 |   AUGMENTATIONS: []
 75 |   MIN_SIZE_TRAIN: (288, 320, 352, 384, 416, 448, 480, 512)
 76 |   MAX_SIZE_TRAIN: 768
 77 |   MIN_SIZE_TEST: 360
 78 |   FORMAT: "RGB"
 79 |   CROP:
 80 |     ENABLED: True
 81 |     TYPE: "absolute_range"
 82 |     SIZE: (384, 600)
 83 |   # For pseudo videos
 84 |   PSEUDO:
 85 |     AUGMENTATIONS: ['rotation']
 86 |     MIN_SIZE_TRAIN: (288, 320, 352, 384, 416, 448, 480, 512)
 87 |     MAX_SIZE_TRAIN: 768
 88 |     CROP:
 89 |       ENABLED: True
 90 |       TYPE: "absolute_range"
 91 |       SIZE: (384, 600)
 92 |   LSJ_AUG:
 93 |     ENABLED: False
 94 |     IMAGE_SIZE: 768
 95 |     MIN_SCALE: 0.1
 96 |     MAX_SCALE: 2.0
 97 | DATALOADER:
 98 |   FILTER_EMPTY_ANNOTATIONS: True
 99 |   NUM_WORKERS: 8
100 | TEST:
101 |   DETECTIONS_PER_IMAGE: 20
102 | 


--------------------------------------------------------------------------------
/mask2former/modeling/pixel_decoder/ops/setup.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | import os
13 | import glob
14 | 
15 | import torch
16 | 
17 | from torch.utils.cpp_extension import CUDA_HOME
18 | from torch.utils.cpp_extension import CppExtension
19 | from torch.utils.cpp_extension import CUDAExtension
20 | 
21 | from setuptools import find_packages
22 | from setuptools import setup
23 | 
24 | requirements = ["torch", "torchvision"]
25 | 
def get_extensions():
    """
    Describe the MultiScaleDeformableAttention extension for setuptools.

    Collects the C++/CUDA sources under ``src/`` next to this file and returns
    a one-element list holding a CUDA extension. A CUDA build is mandatory:
    NotImplementedError is raised when CUDA_HOME is unset, or when neither
    FORCE_CUDA is set nor ``torch.cuda.is_available()`` is true.
    """
    src_root = os.path.join(os.path.dirname(os.path.abspath(__file__)), "src")

    entry_files = glob.glob(os.path.join(src_root, "*.cpp"))
    cpu_files = glob.glob(os.path.join(src_root, "cpu", "*.cpp"))
    cuda_files = glob.glob(os.path.join(src_root, "cuda", "*.cu"))

    # FORCE_CUDA lets the build proceed on machines where no device is
    # visible at build time (torch only reports devices, not the toolkit).
    cuda_requested = os.environ.get('FORCE_CUDA') or torch.cuda.is_available()

    if CUDA_HOME is None:
        raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.')
    if not cuda_requested:
        raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().')

    nvcc_flags = [
        "-DCUDA_HAS_FP16=1",
        "-D__CUDA_NO_HALF_OPERATORS__",
        "-D__CUDA_NO_HALF_CONVERSIONS__",
        "-D__CUDA_NO_HALF2_OPERATORS__",
    ]

    all_sources = [
        os.path.join(src_root, path)
        for path in entry_files + cpu_files + cuda_files
    ]
    return [
        CUDAExtension(
            "MultiScaleDeformableAttention",
            all_sources,
            include_dirs=[src_root],
            define_macros=[("WITH_CUDA", None)],
            extra_compile_args={"cxx": [], "nvcc": nvcc_flags},
        )
    ]
68 | 
# Package entry point: registers the extension returned by get_extensions()
# and uses torch's BuildExtension as the build_ext command.
setup(
    name="MultiScaleDeformableAttention",
    version="1.0",
    author="Weijie Su",
    url="https://github.com/fundamentalvision/Deformable-DETR",
    description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
    packages=find_packages(exclude=("configs", "tests",)),
    ext_modules=get_extensions(),
    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)
79 | 


--------------------------------------------------------------------------------
/configs/youtubevis_2019/lbvq_R101_bs8.yaml:
--------------------------------------------------------------------------------
  1 | _BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml
  2 | MODEL:
  3 |   WEIGHTS: "mask2former_r101_coco.pkl"
  4 |   RESNETS:
  5 |     DEPTH: 101
  6 |     STEM_TYPE: "basic"  # not used
  7 |     STEM_OUT_CHANNELS: 64
  8 |     STRIDE_IN_1X1: False
  9 |     OUT_FEATURES: ["res2", "res3", "res4", "res5"]
 10 |     # NORM: "SyncBN"
 11 |     RES5_MULTI_GRID: [1, 1, 1]  # not used
 12 |   META_ARCHITECTURE: "Lbvq"
 13 |   MASK_ON: True
 14 |   SEM_SEG_HEAD:
 15 |     NAME: "MaskFormerHead"
 16 |     IGNORE_VALUE: 255
 17 |     NUM_CLASSES: 40
 18 |     LOSS_WEIGHT: 1.0
 19 |     CONVS_DIM: 256
 20 |     MASK_DIM: 256
 21 |     NORM: "GN"
 22 |     # pixel decoder
 23 |     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
 24 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
 25 |     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
 26 |     COMMON_STRIDE: 4
 27 |     TRANSFORMER_ENC_LAYERS: 6
 28 |   MASK_FORMER:
 29 |     TRANSFORMER_DECODER_NAME: "LbvqMultiScaleMaskedTransformerDecoder"
 30 |     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
 31 |     DEEP_SUPERVISION: True
 32 |     NO_OBJECT_WEIGHT: 0.1
 33 |     CLASS_WEIGHT: 2.0
 34 |     MASK_WEIGHT: 5.0
 35 |     DICE_WEIGHT: 5.0
 36 |     HIDDEN_DIM: 256
 37 |     NUM_OBJECT_QUERIES: 100
 38 |     NHEADS: 8
 39 |     DROPOUT: 0.0
 40 |     DIM_FEEDFORWARD: 2048
 41 |     ENC_LAYERS: 0
 42 |     PRE_NORM: False
 43 |     ENFORCE_INPUT_PROJ: False
 44 |     SIZE_DIVISIBILITY: 32
 45 |     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
 46 |     TRAIN_NUM_POINTS: 12544
 47 |     OVERSAMPLE_RATIO: 3.0
 48 |     IMPORTANCE_SAMPLE_RATIO: 0.75
 49 |     TEST:
 50 |       SEMANTIC_ON: False
 51 |       INSTANCE_ON: True
 52 |       PANOPTIC_ON: False
 53 |       OVERLAP_THRESHOLD: 0.8
 54 |       OBJECT_MASK_THRESHOLD: 0.8
 55 | DATASETS:
 56 |   DATASET_RATIO: [1.0, 0.75]
 57 |   TRAIN: ("coco2ytvis2019_train", "ytvis_2019_train")
 58 |   TEST: ("ytvis_2019_val",)
 59 | SOLVER:
 60 |   IMS_PER_BATCH: 8
 61 |   BASE_LR: 0.00005
 62 |   STEPS: (75000,)
 63 |   MAX_ITER: 140000
 64 |   WARMUP_FACTOR: 1.0
 65 |   WARMUP_ITERS: 10
 66 |   WEIGHT_DECAY: 0.05
 67 |   OPTIMIZER: "ADAMW"
 68 |   BACKBONE_MULTIPLIER: 0.1
 69 |   CLIP_GRADIENTS:
 70 |     ENABLED: True
 71 |     CLIP_TYPE: "full_model"
 72 |     CLIP_VALUE: 0.01
 73 |     NORM_TYPE: 2.0
 74 | INPUT:
 75 |   SAMPLING_FRAME_NUM: 5
 76 |   SAMPLING_FRAME_RANGE: 20
 77 |   SAMPLING_FRAME_SHUFFLE: False
 78 |   # MIN_SIZE_TRAIN_SAMPLING : ["range", "choice", "range_by_clip", "choice_by_clip"]
 79 |   MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip"
 80 |   # RANDOM_FLIP : ["none", "horizontal", "flip_by_clip"]. "horizontal" is set by default.
 81 |   RANDOM_FLIP: "flip_by_clip"
 82 |   AUGMENTATIONS: []
 83 |   MIN_SIZE_TRAIN: (288, 320, 352, 384, 416, 448, 480, 512)
 84 |   MAX_SIZE_TRAIN: 768
 85 |   MIN_SIZE_TEST: 360
 86 |   FORMAT: "RGB"
 87 |   CROP:
 88 |     ENABLED: True
 89 |     TYPE: "absolute_range"
 90 |     SIZE: (384, 600)
 91 |   # For pseudo videos
 92 |   PSEUDO:
 93 |     AUGMENTATIONS: ['rotation']
 94 |     MIN_SIZE_TRAIN: (288, 320, 352, 384, 416, 448, 480, 512)
 95 |     MAX_SIZE_TRAIN: 768
 96 |     CROP:
 97 |       ENABLED: True
 98 |       TYPE: "absolute_range"
 99 |       SIZE: (384, 600)
100 |   LSJ_AUG:
101 |     ENABLED: False
102 |     IMAGE_SIZE: 768
103 |     MIN_SCALE: 0.1
104 |     MAX_SCALE: 2.0
105 | DATALOADER:
106 |   FILTER_EMPTY_ANNOTATIONS: True
107 |   NUM_WORKERS: 8
108 | 


--------------------------------------------------------------------------------
/mask2former/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | from __future__ import absolute_import
13 | from __future__ import print_function
14 | from __future__ import division
15 | 
16 | import torch
17 | import torch.nn.functional as F
18 | from torch.autograd import Function
19 | from torch.autograd.function import once_differentiable
20 | 
21 | try:
22 |     import MultiScaleDeformableAttention as MSDA
23 | except ModuleNotFoundError as e:
24 |     info_string = (
25 |         "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n"
26 |         "\t`cd mask2former/modeling/pixel_decoder/ops`\n"
27 |         "\t`sh make.sh`\n"
28 |     )
29 |     raise ModuleNotFoundError(info_string)
30 | 
31 | 
class MSDeformAttnFunction(Function):
    """Autograd ``Function`` that delegates to the compiled ``MSDA`` extension.

    ``forward`` calls ``MSDA.ms_deform_attn_forward`` and ``backward`` calls
    ``MSDA.ms_deform_attn_backward``; gradients are returned only for ``value``,
    ``sampling_locations`` and ``attention_weights``.
    """

    @staticmethod
    def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
        """Run the forward op and stash its tensor inputs for ``backward``."""
        ctx.im2col_step = im2col_step
        tensor_args = (
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
        )
        result = MSDA.ms_deform_attn_forward(*tensor_args, ctx.im2col_step)
        ctx.save_for_backward(*tensor_args)
        return result

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        """Return one gradient slot per ``forward`` input (``None`` where not differentiated)."""
        value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
        grads = MSDA.ms_deform_attn_backward(
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
            grad_output,
            ctx.im2col_step,
        )
        grad_value, grad_sampling_loc, grad_attn_weight = grads

        # Slots line up with forward(): value, spatial shapes, level start index,
        # sampling locations, attention weights, im2col_step.
        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
50 | 
51 | 
def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
    """Pure-PyTorch multi-scale deformable attention, for debugging and tests only.

    The CUDA version should be used instead for real workloads.

    Args:
        value: tensor unpacked as (N, S, M, D); dim 1 is split per level into
            chunks of H*W taken from ``value_spatial_shapes``.
        value_spatial_shapes: iterable of (H, W) pairs, one per level.
        sampling_locations: tensor unpacked as (N, Lq, M, L, P, 2); rescaled with
            ``2 * x - 1`` before being passed to ``grid_sample``.
        attention_weights: tensor reshaped from (N, Lq, M, L, P).

    Returns:
        Contiguous tensor of shape (N, Lq, M * D).
    """
    batch, _, heads, head_dim = value.shape
    _, num_queries, heads, num_levels, num_points, _ = sampling_locations.shape

    level_values = value.split([h * w for h, w in value_spatial_shapes], dim=1)
    # Maps the [0, 1] range onto the [-1, 1] range used by grid_sample.
    grids = 2 * sampling_locations - 1

    sampled_per_level = []
    for level, (height, width) in enumerate(value_spatial_shapes):
        # (N, H*W, M, D) -> (N, H*W, M*D) -> (N, M*D, H*W) -> (N*M, D, H, W)
        feature_map = level_values[level].flatten(2).transpose(1, 2)
        feature_map = feature_map.reshape(batch * heads, head_dim, height, width)
        # (N, Lq, M, P, 2) -> (N, M, Lq, P, 2) -> (N*M, Lq, P, 2)
        level_grid = grids[:, :, :, level].transpose(1, 2).flatten(0, 1)
        # (N*M, D, Lq, P)
        sampled_per_level.append(
            F.grid_sample(feature_map, level_grid,
                          mode='bilinear', padding_mode='zeros', align_corners=False)
        )

    # (N, Lq, M, L, P) -> (N, M, Lq, L, P) -> (N*M, 1, Lq, L*P)
    weights = attention_weights.transpose(1, 2).reshape(
        batch * heads, 1, num_queries, num_levels * num_points
    )
    # (N*M, D, Lq, L, P) -> (N*M, D, Lq, L*P), weighted sum over the last axis.
    stacked = torch.stack(sampled_per_level, dim=-2).flatten(-2)
    attended = (stacked * weights).sum(-1).view(batch, heads * head_dim, num_queries)
    return attended.transpose(1, 2).contiguous()
73 | 


--------------------------------------------------------------------------------
/mask2former/test_time_augmentation.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | import copy
  3 | import logging
  4 | from itertools import count
  5 | 
  6 | import numpy as np
  7 | import torch
  8 | from fvcore.transforms import HFlipTransform
  9 | from torch import nn
 10 | from torch.nn.parallel import DistributedDataParallel
 11 | 
 12 | from detectron2.data.detection_utils import read_image
 13 | from detectron2.modeling import DatasetMapperTTA
 14 | 
 15 | 
 16 | __all__ = [
 17 |     "SemanticSegmentorWithTTA",
 18 | ]
 19 | 
 20 | 
 21 | class SemanticSegmentorWithTTA(nn.Module):
 22 |     """
 23 |     A SemanticSegmentor with test-time augmentation enabled.
 24 |     Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`.
 25 |     """
 26 | 
 27 |     def __init__(self, cfg, model, tta_mapper=None, batch_size=1):
 28 |         """
 29 |         Args:
 30 |             cfg (CfgNode):
 31 |             model (SemanticSegmentor): a SemanticSegmentor to apply TTA on.
 32 |             tta_mapper (callable): takes a dataset dict and returns a list of
 33 |                 augmented versions of the dataset dict. Defaults to
 34 |                 `DatasetMapperTTA(cfg)`.
 35 |             batch_size (int): batch the augmented images into this batch size for inference.
 36 |         """
 37 |         super().__init__()
 38 |         if isinstance(model, DistributedDataParallel):
 39 |             model = model.module
 40 |         self.cfg = cfg.clone()
 41 | 
 42 |         self.model = model
 43 | 
 44 |         if tta_mapper is None:
 45 |             tta_mapper = DatasetMapperTTA(cfg)
 46 |         self.tta_mapper = tta_mapper
 47 |         self.batch_size = batch_size
 48 | 
 49 |     def __call__(self, batched_inputs):
 50 |         """
 51 |         Same input/output format as :meth:`SemanticSegmentor.forward`
 52 |         """
 53 | 
 54 |         def _maybe_read_image(dataset_dict):
 55 |             ret = copy.copy(dataset_dict)
 56 |             if "image" not in ret:
 57 |                 image = read_image(ret.pop("file_name"), self.model.input_format)
 58 |                 image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1)))  # CHW
 59 |                 ret["image"] = image
 60 |             if "height" not in ret and "width" not in ret:
 61 |                 ret["height"] = image.shape[1]
 62 |                 ret["width"] = image.shape[2]
 63 |             return ret
 64 | 
 65 |         processed_results = []
 66 |         for x in batched_inputs:
 67 |             result = self._inference_one_image(_maybe_read_image(x))
 68 |             processed_results.append(result)
 69 |         return processed_results
 70 | 
 71 |     def _inference_one_image(self, input):
 72 |         """
 73 |         Args:
 74 |             input (dict): one dataset dict with "image" field being a CHW tensor
 75 |         Returns:
 76 |             dict: one output dict
 77 |         """
 78 |         orig_shape = (input["height"], input["width"])
 79 |         augmented_inputs, tfms = self._get_augmented_inputs(input)
 80 | 
 81 |         final_predictions = None
 82 |         count_predictions = 0
 83 |         for input, tfm in zip(augmented_inputs, tfms):
 84 |             count_predictions += 1
 85 |             with torch.no_grad():
 86 |                 if final_predictions is None:
 87 |                     if any(isinstance(t, HFlipTransform) for t in tfm.transforms):
 88 |                         final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2])
 89 |                     else:
 90 |                         final_predictions = self.model([input])[0].pop("sem_seg")
 91 |                 else:
 92 |                     if any(isinstance(t, HFlipTransform) for t in tfm.transforms):
 93 |                         final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2])
 94 |                     else:
 95 |                         final_predictions += self.model([input])[0].pop("sem_seg")
 96 | 
 97 |         final_predictions = final_predictions / count_predictions
 98 |         return {"sem_seg": final_predictions}
 99 | 
100 |     def _get_augmented_inputs(self, input):
101 |         augmented_inputs = self.tta_mapper(input)
102 |         tfms = [x.pop("transforms") for x in augmented_inputs]
103 |         return augmented_inputs, tfms
104 | 


--------------------------------------------------------------------------------
/mask2former/utils/misc.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py
  3 | """
  4 | Misc functions, including distributed helpers.
  5 | 
  6 | Mostly copy-paste from torchvision references.
  7 | """
  8 | from typing import List, Optional
  9 | 
 10 | import torch
 11 | import torch.distributed as dist
 12 | import torchvision
 13 | from torch import Tensor
 14 | 
 15 | 
 16 | def _max_by_axis(the_list):
 17 |     # type: (List[List[int]]) -> List[int]
 18 |     maxes = the_list[0]
 19 |     for sublist in the_list[1:]:
 20 |         for index, item in enumerate(sublist):
 21 |             maxes[index] = max(maxes[index], item)
 22 |     return maxes
 23 | 
 24 | 
 25 | class NestedTensor(object):
 26 |     def __init__(self, tensors, mask: Optional[Tensor]):
 27 |         self.tensors = tensors
 28 |         self.mask = mask
 29 | 
 30 |     def to(self, device):
 31 |         # type: (Device) -> NestedTensor # noqa
 32 |         cast_tensor = self.tensors.to(device)
 33 |         mask = self.mask
 34 |         if mask is not None:
 35 |             assert mask is not None
 36 |             cast_mask = mask.to(device)
 37 |         else:
 38 |             cast_mask = None
 39 |         return NestedTensor(cast_tensor, cast_mask)
 40 | 
 41 |     def decompose(self):
 42 |         return self.tensors, self.mask
 43 | 
 44 |     def __repr__(self):
 45 |         return str(self.tensors)
 46 | 
 47 | 
 48 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
 49 |     # TODO make this more general
 50 |     if tensor_list[0].ndim == 3:
 51 |         if torchvision._is_tracing():
 52 |             # nested_tensor_from_tensor_list() does not export well to ONNX
 53 |             # call _onnx_nested_tensor_from_tensor_list() instead
 54 |             return _onnx_nested_tensor_from_tensor_list(tensor_list)
 55 | 
 56 |         # TODO make it support different-sized images
 57 |         max_size = _max_by_axis([list(img.shape) for img in tensor_list])
 58 |         # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list]))
 59 |         batch_shape = [len(tensor_list)] + max_size
 60 |         b, c, h, w = batch_shape
 61 |         dtype = tensor_list[0].dtype
 62 |         device = tensor_list[0].device
 63 |         tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
 64 |         mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
 65 |         for img, pad_img, m in zip(tensor_list, tensor, mask):
 66 |             pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
 67 |             m[: img.shape[1], : img.shape[2]] = False
 68 |     else:
 69 |         raise ValueError("not supported")
 70 |     return NestedTensor(tensor, mask)
 71 | 
 72 | 
 73 | # _onnx_nested_tensor_from_tensor_list() is an implementation of
 74 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing.
@torch.jit.unused
def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:
    """Build a padded ``NestedTensor`` without slice-assignment or in-place copies.

    Every tensor is padded at the end of each dimension up to the per-dimension
    maximum over ``tensor_list`` and the padded tensors are stacked. The mask is
    ``False`` over the original extent of each tensor and ``True`` on the padding.
    """
    # Per-dimension maximum size across the list.
    # NOTE(review): torch.stack needs tensors, so this presumably relies on
    # `img.shape[i]` being a tensor while tracing — confirm before calling it eagerly.
    max_size = []
    for i in range(tensor_list[0].dim()):
        max_size_i = torch.max(
            torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)
        ).to(torch.int64)
        max_size.append(max_size_i)
    max_size = tuple(max_size)

    # work around for
    # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
    # m[: img.shape[1], :img.shape[2]] = False
    # which is not yet supported in onnx
    padded_imgs = []
    padded_masks = []
    for img in tensor_list:
        # Amount of trailing padding needed in each dimension.
        padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
        # F.pad lists pad widths starting from the last dimension, hence the reversed order.
        padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0]))
        padded_imgs.append(padded_img)

        # Start from an all-zero mask shaped like img[0] and pad it with ones, so
        # that after the bool cast only the padded area is True.
        m = torch.zeros_like(img[0], dtype=torch.int, device=img.device)
        padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1)
        padded_masks.append(padded_mask.to(torch.bool))

    tensor = torch.stack(padded_imgs)
    mask = torch.stack(padded_masks)

    return NestedTensor(tensor, mask=mask)
104 | 
105 | 
def is_dist_avail_and_initialized():
    """Return True only when ``torch.distributed`` is available and initialized."""
    return dist.is_available() and dist.is_initialized()
112 | 


--------------------------------------------------------------------------------
/lbvq/modeling/transformer_decoder/position_encoding.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Various positional encodings for the transformer.
  3 | """
  4 | import math
  5 | 
  6 | import torch
  7 | from torch import nn
  8 | 
  9 | 
 10 | class PositionEmbeddingSine(nn.Module):
 11 |     """
 12 |     This is a more standard version of the position embedding, very similar to the one
 13 |     used by the Attention is all you need paper, generalized to work on images.
 14 |     """
 15 | 
 16 |     def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
 17 |         super().__init__()
 18 |         self.num_pos_feats = num_pos_feats
 19 |         self.temperature = temperature
 20 |         self.normalize = normalize
 21 |         if scale is not None and normalize is False:
 22 |             raise ValueError("normalize should be True if scale is passed")
 23 |         if scale is None:
 24 |             scale = 2 * math.pi
 25 |         self.scale = scale
 26 | 
 27 |     def forward(self, x, mask=None):
 28 |         if mask is None:
 29 |             mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool)
 30 |         not_mask = ~mask
 31 |         y_embed = not_mask.cumsum(1, dtype=torch.float32)
 32 |         x_embed = not_mask.cumsum(2, dtype=torch.float32)
 33 |         if self.normalize:
 34 |             eps = 1e-6
 35 |             y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
 36 |             x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
 37 | 
 38 |         dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
 39 |         dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.num_pos_feats)
 40 | 
 41 |         pos_x = x_embed[:, :, :, None] / dim_t
 42 |         pos_y = y_embed[:, :, :, None] / dim_t
 43 |         pos_x = torch.stack(
 44 |             (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
 45 |         ).flatten(3)
 46 |         pos_y = torch.stack(
 47 |             (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
 48 |         ).flatten(3)
 49 |         pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
 50 |         return pos
 51 |     
 52 |     def __repr__(self, _repr_indent=4):
 53 |         head = "Positional encoding " + self.__class__.__name__
 54 |         body = [
 55 |             "num_pos_feats: {}".format(self.num_pos_feats),
 56 |             "temperature: {}".format(self.temperature),
 57 |             "normalize: {}".format(self.normalize),
 58 |             "scale: {}".format(self.scale),
 59 |         ]
 60 |         # _repr_indent = 4
 61 |         lines = [head] + [" " * _repr_indent + line for line in body]
 62 |         return "\n".join(lines)
 63 | 
 64 | 
 65 | class PositionEmbeddingSine1D(nn.Module):
 66 |     """
 67 |     This is a more standard version of the position embedding, very similar to the one
 68 |     used by the Attention is all you need paper, generalized to work on images.
 69 |     """
 70 | 
 71 |     def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
 72 |         super().__init__()
 73 |         self.num_pos_feats = num_pos_feats
 74 |         self.temperature = temperature
 75 |         self.normalize = normalize
 76 |         if scale is not None and normalize is False:
 77 |             raise ValueError("normalize should be True if scale is passed")
 78 |         if scale is None:
 79 |             scale = 2 * math.pi
 80 |         self.scale = scale
 81 | 
 82 |     def forward(self, x, mask=None):
 83 |         """
 84 |         Args:
 85 |             x (Tensor): [T, Q, B, C]
 86 |         Output: temporal positional embedding with the same shape of x.
 87 |         """
 88 |         if mask is None:
 89 |             mask = torch.zeros((x.size(0), x.size(1), x.size(2)), device=x.device, dtype=torch.bool)
 90 |         not_mask = ~mask
 91 |         z_embed = not_mask.cumsum(0, dtype=torch.float32)
 92 |         if self.normalize:
 93 |             eps = 1e-6
 94 |             z_embed = z_embed / (z_embed[-1:, :, :] + eps) * self.scale
 95 | 
 96 |         dim_t_z = torch.arange((self.num_pos_feats * 2), dtype=torch.float32, device=x.device)
 97 |         dim_t_z = self.temperature ** (2 * torch.div(dim_t_z, 2, rounding_mode="floor") / (self.num_pos_feats * 2))
 98 | 
 99 |         pos_z = z_embed[:, :, :, None] / dim_t_z
100 |         pos_z = torch.stack((pos_z[:, :, :, 0::2].sin(), pos_z[:, :, :, 1::2].cos()), dim=4).flatten(3)
101 | 
102 |         pos = pos_z
103 |         return pos
104 | 


--------------------------------------------------------------------------------
/lbvq/data/datasets/builtin.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | from detectron2.data.datasets.builtin_meta import _get_builtin_metadata
  4 | from detectron2.data.datasets.coco import register_coco_instances
  5 | 
  6 | from .ovis import _get_ovis_instances_meta
  7 | from lbvq.data.datasets.ytvis import (
  8 |     register_ytvis_instances,
  9 |     _get_ytvis_2019_instances_meta,
 10 |     _get_ytvis_2021_instances_meta
 11 | )
 12 | 
# Each table below maps a dataset name to an (image_root, json_file) pair. Both
# paths are relative; the register_all_* functions in this module join them onto
# the dataset root they are given.

# ==== Predefined splits for YTVIS 2019 ===========
_PREDEFINED_SPLITS_YTVIS_2019 = {
    "ytvis_2019_train": ("ytvis_2019/train/JPEGImages",
                         "ytvis_2019/train.json"),
    "ytvis_2019_val": ("ytvis_2019/valid/JPEGImages",
                       "ytvis_2019/valid.json"),
    "ytvis_2019_test": ("ytvis_2019/test/JPEGImages",
                        "ytvis_2019/test.json"),
}


# ==== Predefined splits for YTVIS 2021 ===========
# NOTE: "ytvis_2022_val" lives in this table, so it is registered by
# register_all_ytvis_2021 with the 2021 instance metadata.
_PREDEFINED_SPLITS_YTVIS_2021 = {
    "ytvis_2021_train": ("ytvis_2021/train/JPEGImages",
                         "ytvis_2021/train.json"),
    "ytvis_2021_val": ("ytvis_2021/valid/JPEGImages",
                       "ytvis_2021/valid.json"),
    "ytvis_2021_test": ("ytvis_2021/test/JPEGImages",
                        "ytvis_2021/test.json"),
    "ytvis_2022_val": ("ytvis_2022/valid/JPEGImages",
                       "ytvis_2022/valid.json"),
}


# ====    Predefined splits for OVIS    ===========
_PREDEFINED_SPLITS_OVIS = {
    "ovis_train": ("ovis/train",
                   "ovis/annotations/train.json"),
    "ovis_val": ("ovis/valid",
                 "ovis/annotations/valid.json"),
    "ovis_test": ("ovis/test",
                  "ovis/annotations/test.json"),
}


# ==== Predefined splits for COCO images with converted annotation files ====
# These reuse the COCO train2017 / val2017 image folders with coco2* json files.
_PREDEFINED_SPLITS_COCO_VIDEO = {
    "coco2ytvis2019_train": ("coco/train2017", "coco/annotations/coco2ytvis2019_train.json"),
    "coco2ytvis2019_val": ("coco/val2017", "coco/annotations/coco2ytvis2019_val.json"),
    "coco2ytvis2021_train": ("coco/train2017", "coco/annotations/coco2ytvis2021_train.json"),
    "coco2ytvis2021_val": ("coco/val2017", "coco/annotations/coco2ytvis2021_val.json"),
    "coco2ovis_train": ("coco/train2017", "coco/annotations/coco2ovis_train.json"),
    "coco2ovis_val": ("coco/val2017", "coco/annotations/coco2ovis_val.json"),
}
 56 | 
 57 | 
 58 | def register_all_ytvis_2019(root):
 59 |     for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2019.items():
 60 |         # Assume pre-defined datasets live in `./datasets`.
 61 |         register_ytvis_instances(
 62 |             key,
 63 |             _get_ytvis_2019_instances_meta(),
 64 |             os.path.join(root, json_file) if "://" not in json_file else json_file,
 65 |             os.path.join(root, image_root),
 66 |         )
 67 | 
 68 | 
 69 | def register_all_ytvis_2021(root):
 70 |     for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2021.items():
 71 |         # Assume pre-defined datasets live in `./datasets`.
 72 |         register_ytvis_instances(
 73 |             key,
 74 |             _get_ytvis_2021_instances_meta(),
 75 |             os.path.join(root, json_file) if "://" not in json_file else json_file,
 76 |             os.path.join(root, image_root),
 77 |         )
 78 | 
 79 | 
 80 | def register_all_coco_video(root):
 81 |     for key, (image_root, json_file) in _PREDEFINED_SPLITS_COCO_VIDEO.items():
 82 |         # Assume pre-defined datasets live in `./datasets`.
 83 |         register_coco_instances(
 84 |             key,
 85 |             _get_builtin_metadata("coco"),
 86 |             os.path.join(root, json_file) if "://" not in json_file else json_file,
 87 |             os.path.join(root, image_root),
 88 |         )
 89 | 
 90 | 
 91 | def register_all_ovis(root):
 92 |     for key, (image_root, json_file) in _PREDEFINED_SPLITS_OVIS.items():
 93 |         # Assume pre-defined datasets live in `./datasets`.
 94 |         register_ytvis_instances(
 95 |             key,
 96 |             _get_ovis_instances_meta(),
 97 |             os.path.join(root, json_file) if "://" not in json_file else json_file,
 98 |             os.path.join(root, image_root),
 99 |         )
100 | 
101 | 
# Register all predefined datasets as an import side effect. The guard matches
# when this module's dotted name ends in ".builtin", i.e. when it is imported as
# a submodule rather than executed as a top-level script.
if __name__.endswith(".builtin"):
    # Assume pre-defined datasets live in `./datasets`.
    # The DETECTRON2_DATASETS environment variable overrides that default root.
    _root = os.getenv("DETECTRON2_DATASETS", "datasets")
    register_all_ovis(_root)
    register_all_ytvis_2019(_root)
    register_all_ytvis_2021(_root)
    register_all_coco_video(_root)
109 | 


--------------------------------------------------------------------------------
/mask2former/modeling/pixel_decoder/ops/test.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | from __future__ import absolute_import
13 | from __future__ import print_function
14 | from __future__ import division
15 | 
16 | import time
17 | import torch
18 | import torch.nn as nn
19 | from torch.autograd import gradcheck
20 | 
21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
22 | 
23 | 
# Problem sizes shared by every check below. They are used as tensor dimensions:
# value is (N, S, M, D), sampling locations are (N, Lq, M, L, P, 2) and attention
# weights are (N, Lq, M, L, P).
N, M, D = 1, 2, 2
Lq, L, P = 2, 2, 2
# One (H, W) row per level, created directly on the GPU.
shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()
# Offset of each level inside the flattened S axis: an exclusive cumulative sum of H*W.
level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))
# Total number of spatial positions over all levels.
S = sum([(H*W).item() for H, W in shapes])


# Fixed seed so the random inputs generated by the checks are reproducible.
torch.manual_seed(3)
32 | 
33 | 
@torch.no_grad()
def check_forward_equal_with_pytorch_double():
    """Compare the CUDA forward op with the PyTorch reference on float64 inputs.

    Prints whether the outputs match under ``torch.allclose`` defaults, plus the
    maximum absolute and relative errors.
    """
    value = torch.rand(N, S, M, D).cuda() * 0.01
    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
    # Normalize so the weights sum to one over the last two (level, point) axes.
    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
    im2col_step = 2

    value_d = value.double()
    locations_d = sampling_locations.double()
    weights_d = attention_weights.double()

    output_pytorch = ms_deform_attn_core_pytorch(value_d, shapes, locations_d, weights_d).detach().cpu()
    output_cuda = MSDeformAttnFunction.apply(
        value_d, shapes, level_start_index, locations_d, weights_d, im2col_step
    ).detach().cpu()

    fwdok = torch.allclose(output_cuda, output_pytorch)
    abs_diff = (output_cuda - output_pytorch).abs()
    max_abs_err = abs_diff.max()
    max_rel_err = (abs_diff / output_pytorch.abs()).max()

    print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
48 | 
49 | 
@torch.no_grad()
def check_forward_equal_with_pytorch_float():
    """Compare the CUDA forward op with the PyTorch reference on float32 inputs.

    Uses the looser tolerances ``rtol=1e-2, atol=1e-3`` and prints the match flag
    plus the maximum absolute and relative errors.
    """
    value = torch.rand(N, S, M, D).cuda() * 0.01
    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
    # Normalize so the weights sum to one over the last two (level, point) axes.
    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
    im2col_step = 2

    output_pytorch = ms_deform_attn_core_pytorch(
        value, shapes, sampling_locations, attention_weights
    ).detach().cpu()
    output_cuda = MSDeformAttnFunction.apply(
        value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step
    ).detach().cpu()

    fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
    abs_diff = (output_cuda - output_pytorch).abs()
    max_abs_err = abs_diff.max()
    max_rel_err = (abs_diff / output_pytorch.abs()).max()

    print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
64 | 
65 | 
def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True):
    """Run ``torch.autograd.gradcheck`` on ``MSDeformAttnFunction`` and print the result.

    Args:
        channels: size of the last dimension of the random ``value`` tensor.
        grad_value, grad_sampling_loc, grad_attn_weight: whether the corresponding
            input has ``requires_grad`` set before the check.
    """
    value = torch.rand(N, S, M, channels).cuda() * 0.01
    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
    # Normalize so the weights sum to one over the last two (level, point) axes.
    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
    im2col_step = 2

    grad_flags = (
        (value, grad_value),
        (sampling_locations, grad_sampling_loc),
        (attention_weights, grad_attn_weight),
    )
    for tensor, wants_grad in grad_flags:
        tensor.requires_grad = wants_grad

    # gradcheck is run on the float64 casts of the inputs.
    check_inputs = (
        value.double(),
        shapes,
        level_start_index,
        sampling_locations.double(),
        attention_weights.double(),
        im2col_step,
    )
    gradok = gradcheck(MSDeformAttnFunction.apply, check_inputs)

    print(f'* {gradok} check_gradient_numerical(D={channels})')
82 | 
83 | 
# Script entry point: run both forward comparisons, then the numerical gradient
# check for a range of channel sizes.
if __name__ == '__main__':
    check_forward_equal_with_pytorch_double()
    check_forward_equal_with_pytorch_float()

    # Channel counts to check; the list mixes powers of two with other sizes.
    for channels in [30, 32, 64, 71, 1025, 2048, 3096]:
        check_gradient_numerical(channels, True, True, True)
90 | 
91 | 
92 | 
93 | 


--------------------------------------------------------------------------------
/mask2former/config.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # Copyright (c) Facebook, Inc. and its affiliates.
  3 | from detectron2.config import CfgNode as CN
  4 | 
  5 | 
  6 | def add_maskformer2_config(cfg):
  7 |     """
  8 |     Add config for MASK_FORMER.
  9 |     """
 10 |     # NOTE: configs from original maskformer
 11 |     # data config
 12 |     # select the dataset mapper
 13 |     cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic"
 14 |     # Color augmentation
 15 |     cfg.INPUT.COLOR_AUG_SSD = False
 16 |     # We retry random cropping until no single category in semantic segmentation GT occupies more
 17 |     # than `SINGLE_CATEGORY_MAX_AREA` part of the crop.
 18 |     cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0
 19 |     # Pad image and segmentation GT in dataset mapper.
 20 |     cfg.INPUT.SIZE_DIVISIBILITY = -1
 21 | 
 22 |     # solver config
 23 |     # weight decay on embedding
 24 |     cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0
 25 |     # optimizer
 26 |     cfg.SOLVER.OPTIMIZER = "ADAMW"
 27 |     cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1
 28 | 
 29 |     # mask_former model config
 30 |     cfg.MODEL.MASK_FORMER = CN()
 31 | 
 32 |     # loss
 33 |     cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION = True
 34 |     cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT = 0.1
 35 |     cfg.MODEL.MASK_FORMER.CLASS_WEIGHT = 1.0
 36 |     cfg.MODEL.MASK_FORMER.DICE_WEIGHT = 1.0
 37 |     cfg.MODEL.MASK_FORMER.MASK_WEIGHT = 20.0
 38 | 
 39 |     # transformer config
 40 |     cfg.MODEL.MASK_FORMER.NHEADS = 8
 41 |     cfg.MODEL.MASK_FORMER.DROPOUT = 0.1
 42 |     cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD = 2048
 43 |     cfg.MODEL.MASK_FORMER.ENC_LAYERS = 0
 44 |     cfg.MODEL.MASK_FORMER.DEC_LAYERS = 6
 45 |     cfg.MODEL.MASK_FORMER.PRE_NORM = False
 46 | 
 47 |     cfg.MODEL.MASK_FORMER.HIDDEN_DIM = 256
 48 |     cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES = 100
 49 | 
 50 |     cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE = "res5"
 51 |     cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ = False
 52 | 
 53 |     # mask_former inference config
 54 |     cfg.MODEL.MASK_FORMER.TEST = CN()
 55 |     cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True
 56 |     cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = False
 57 |     cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = False
 58 |     cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0
 59 |     cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD = 0.0
 60 |     cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False
 61 | 
 62 |     # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet)
 63 |     # you can use this config to override
 64 |     cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY = 32
 65 | 
 66 |     # pixel decoder config
 67 |     cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256
 68 |     # adding transformer in pixel decoder
 69 |     cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0
 70 |     # pixel decoder
 71 |     cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder"
 72 | 
 73 |     # swin transformer backbone
 74 |     cfg.MODEL.SWIN = CN()
 75 |     cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224
 76 |     cfg.MODEL.SWIN.PATCH_SIZE = 4
 77 |     cfg.MODEL.SWIN.EMBED_DIM = 96
 78 |     cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2]
 79 |     cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24]
 80 |     cfg.MODEL.SWIN.WINDOW_SIZE = 7
 81 |     cfg.MODEL.SWIN.MLP_RATIO = 4.0
 82 |     cfg.MODEL.SWIN.QKV_BIAS = True
 83 |     cfg.MODEL.SWIN.QK_SCALE = None
 84 |     cfg.MODEL.SWIN.DROP_RATE = 0.0
 85 |     cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0
 86 |     cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3
 87 |     cfg.MODEL.SWIN.APE = False
 88 |     cfg.MODEL.SWIN.PATCH_NORM = True
 89 |     cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
 90 |     cfg.MODEL.SWIN.USE_CHECKPOINT = False
 91 | 
 92 |     # NOTE: maskformer2 extra configs
 93 |     # transformer module
 94 |     cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME = "MultiScaleMaskedTransformerDecoder"
 95 | 
 96 |     # LSJ aug
 97 |     cfg.INPUT.IMAGE_SIZE = 1024
 98 |     cfg.INPUT.MIN_SCALE = 0.1
 99 |     cfg.INPUT.MAX_SCALE = 2.0
100 | 
101 |     # MSDeformAttn encoder configs
102 |     cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"]
103 |     cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4
104 |     cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8
105 | 
106 |     # point loss configs
107 |     # Number of points sampled during training for a mask point head.
108 |     cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS = 112 * 112
109 |     # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the
110 |     # original paper.
111 |     cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO = 3.0
112 |     # Importance sampling parameter for PointRend point sampling during training. Parametr `beta` in
113 |     # the original paper.
114 |     cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75
115 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Learning Better Video Query with SAM for Video Instance Segmentation (TCSVT 2024)
  2 | 
  3 | [Hao Fang](https://fanghaook.github.io), Tong Zhang, [Xiaofei Zhou](https://scholar.google.cz/citations?user=2PUAFW8AAAAJ), [Xinxin Zhang](https://scholar.google.cz/citations?user=rPv44PoAAAAJ)
  4 | 
  5 | [[`paper`](https://ieeexplore.ieee.org/abstract/document/10418101)] [[`BibTeX`](#CitingLBVQ)]
  6 | 
  7 | <div align="center">
  8 |   <img src="LBVQ.png" width="100%" height="100%"/>
  9 | </div><br/>
 10 | 
 11 | ## Installation
 12 | 
 13 | See [installation instructions](INSTALL.md).
 14 | 
 15 | ## Getting Started
 16 | 
 17 | We provide a script, `train_net.py`, that can train all the configs provided in LBVQ.
 18 | 
 19 | To train a model with "train_net.py" on VIS, first
 20 | set up the corresponding datasets following
 21 | [Preparing Datasets for LBVQ](./datasets/README.md).
 22 | 
 23 | Then run with COCO pretrained weights in the Model Zoo:
 24 | ```
 25 | python train_net.py --num-gpus 8 \
 26 |   --config-file configs/youtubevis_2019/lbvq_R50_bs8.yaml \
 27 |   MODEL.WEIGHTS mask2former_r50_coco.pkl
 28 | ```
 29 | 
 30 | To evaluate a model's performance, use
 31 | ```
 32 | python train_net.py \
 33 |   --config-file configs/youtubevis_2019/lbvq_R50_bs8.yaml \
 34 |   --eval-only MODEL.WEIGHTS lbvq_r50_ytvis19.pth
 35 | ```
 36 | If you want to use SAM to refine your results, use
 37 | ```
 38 | python train_net.py \
 39 |   --config-file configs/youtubevis_2019/lbvq_R50_bs8.yaml \
 40 |   --eval-only MODEL.WEIGHTS lbvq_r50_ytvis19.pth SAM True
 41 | ```
 42 | To visualize a video in the dataset, use
 43 | ```
 44 | python demo_lbvq/demo.py --config-file configs/youtubevis_2019/lbvq_R50_bs8.yaml \
 45 |   --input datasets/ytvis_2019/valid/JPEGImages/xxxxxxx/*.jpg \
 46 |   --output output/demo --save-frames True \
 47 |   --opts MODEL.WEIGHTS lbvq_r50_ytvis19.pth
 48 | ```
 49 | 
 50 | ## <a name="ModelZoo"></a>Model Zoo
 51 | 
 52 | ### Pretrained weights on COCO
 53 | |    Name     | R-50  | R-101 |
 54 | |:-----------:| :---: | :---: |
 55 | | Mask2Former | [model](https://drive.google.com/file/d/1-KqRhmkNu-FCGL6ssfgW-0jl1UPXXEn6/view?usp=sharing) | [model](https://drive.google.com/file/d/1-JHwIq7LDcRf78d2clBZXfji96CX0TUy/view?usp=sharing) |
 56 | 
 57 | ### HQ-SAM
 58 | |  Name  |   vit_h   |
 59 | |:------:|:---------:| 
 60 | | HQ-SAM | [model](https://drive.google.com/file/d/1qobFYrI4eyIANfBSmYcGuWRaSIXfMOQ8/view) |
 61 | 
 62 | ### YouTubeVIS-2019
 63 | | Name | Backbone | AP | AP50 | AP75| AR1 | AR10 | Download |
 64 | |:----:|:--------:| :---: | :---: | :---: | :---: | :---: | :---: |
 65 | | LBVQ |   R-50   | 52.2 | 74.8 | 57.7 | 49.9 | 59.8 | [model](https://drive.google.com/file/d/1-NW-_J3TFta4VyRXYwNp8u5w3Pn5wA7Z/view?usp=sharing) |
 66 | | LBVQ |  R-101   | 53.1 | 76.3 | 60.2 | 50.0 | 59.2 | [model](https://drive.google.com/file/d/1-_mF--EijC24ucgSLqD2i2cCRUKCOeit/view?usp=sharing) |
 67 | 
 68 | ### YouTubeVIS-2021
 69 | | Name | Backbone | AP | AP50 | AP75| AR1 | AR10 | Download |
 70 | | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
 71 | | LBVQ | R-50 | 44.8 | 67.4 | 46.0 | 41.6 | 52.3 | [model](https://drive.google.com/file/d/1-J9e0cr-Rnh87ukAu90EqabxMehNqgO_/view?usp=sharing) |
 72 | 
 73 | ### OVIS
 74 | | Name | Backbone | AP | AP50 | AP75| AR1 | AR10 | Download |
 75 | | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
 76 | | LBVQ | R-50 | 22.2 | 45.3 | 19.0 | 12.4 | 27.5 | [model](https://drive.google.com/file/d/1-QoQ6BWreWZVl6sReu30-rN5TeO2IRSv/view?usp=sharing) |
 77 | 
 78 | 
 79 | ## License
 80 | The majority of LBVQ is licensed under an
 81 | [Apache-2.0 License](LICENSE).
 82 | However, portions of the project are available under separate license terms: Detectron2 ([Apache-2.0 License](https://github.com/facebookresearch/detectron2/blob/main/LICENSE)), Mask2Former ([MIT License](https://github.com/facebookresearch/Mask2Former/blob/main/LICENSE)), and VITA ([Apache-2.0 License](https://github.com/sukjunhwang/VITA/blob/main/LICENSE)).
 83 | 
 84 | ## <a name="CitingLBVQ"></a>Citing LBVQ
 85 | 
 86 | If you use LBVQ in your research or wish to refer to the baseline results published in the Model Zoo, please use the following BibTeX entry.
 87 | 
 88 | ```BibTeX
 89 | @article{Fang2024learning,
 90 |   title={Learning Better Video Query with SAM for Video Instance Segmentation},
 91 |   author={Fang, Hao and Zhang, Tong and Zhou, Xiaofei and Zhang, Xinxin},
 92 |   journal={IEEE Transactions on Circuits and Systems for Video Technology},
 93 |   year={2024},
 94 |   publisher={IEEE}
 95 | }
 96 | ```
 97 | 
 98 | ## Acknowledgement
 99 | 
100 | Our code is largely based on [Detectron2](https://github.com/facebookresearch/detectron2), [Mask2Former](https://github.com/facebookresearch/Mask2Former), and [VITA](https://github.com/sukjunhwang/VITA). We are truly grateful for their excellent work.
101 | 


--------------------------------------------------------------------------------
/mask2former/data/datasets/register_ade20k_instance.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Facebook, Inc. and its affiliates.
 2 | import json
 3 | import logging
 4 | import numpy as np
 5 | import os
 6 | from PIL import Image
 7 | 
 8 | from detectron2.data import DatasetCatalog, MetadataCatalog
 9 | from detectron2.data.datasets.coco import load_coco_json, register_coco_instances
10 | from detectron2.utils.file_io import PathManager
11 | 
12 | ADE_CATEGORIES = [{'id': 7, 'name': 'bed'}, {'id': 8, 'name': 'windowpane'}, {'id': 10, 'name': 'cabinet'}, {'id': 12, 'name': 'person'}, {'id': 14, 'name': 'door'}, {'id': 15, 'name': 'table'}, {'id': 18, 'name': 'curtain'}, {'id': 19, 'name': 'chair'}, {'id': 20, 'name': 'car'}, {'id': 22, 'name': 'painting'}, {'id': 23, 'name': 'sofa'}, {'id': 24, 'name': 'shelf'}, {'id': 27, 'name': 'mirror'}, {'id': 30, 'name': 'armchair'}, {'id': 31, 'name': 'seat'}, {'id': 32, 'name': 'fence'}, {'id': 33, 'name': 'desk'}, {'id': 35, 'name': 'wardrobe'}, {'id': 36, 'name': 'lamp'}, {'id': 37, 'name': 'bathtub'}, {'id': 38, 'name': 'railing'}, {'id': 39, 'name': 'cushion'}, {'id': 41, 'name': 'box'}, {'id': 42, 'name': 'column'}, {'id': 43, 'name': 'signboard'}, {'id': 44, 'name': 'chest of drawers'}, {'id': 45, 'name': 'counter'}, {'id': 47, 'name': 'sink'}, {'id': 49, 'name': 'fireplace'}, {'id': 50, 'name': 'refrigerator'}, {'id': 53, 'name': 'stairs'}, {'id': 55, 'name': 'case'}, {'id': 56, 'name': 'pool table'}, {'id': 57, 'name': 'pillow'}, {'id': 58, 'name': 'screen door'}, {'id': 62, 'name': 'bookcase'}, {'id': 64, 'name': 'coffee table'}, {'id': 65, 'name': 'toilet'}, {'id': 66, 'name': 'flower'}, {'id': 67, 'name': 'book'}, {'id': 69, 'name': 'bench'}, {'id': 70, 'name': 'countertop'}, {'id': 71, 'name': 'stove'}, {'id': 72, 'name': 'palm'}, {'id': 73, 'name': 'kitchen island'}, {'id': 74, 'name': 'computer'}, {'id': 75, 'name': 'swivel chair'}, {'id': 76, 'name': 'boat'}, {'id': 78, 'name': 'arcade machine'}, {'id': 80, 'name': 'bus'}, {'id': 81, 'name': 'towel'}, {'id': 82, 'name': 'light'}, {'id': 83, 'name': 'truck'}, {'id': 85, 'name': 'chandelier'}, {'id': 86, 'name': 'awning'}, {'id': 87, 'name': 'streetlight'}, {'id': 88, 'name': 'booth'}, {'id': 89, 'name': 'television receiver'}, {'id': 90, 'name': 'airplane'}, {'id': 92, 'name': 'apparel'}, {'id': 93, 'name': 'pole'}, {'id': 95, 'name': 'bannister'}, {'id': 97, 'name': 'ottoman'}, {'id': 98, 'name': 
'bottle'}, {'id': 102, 'name': 'van'}, {'id': 103, 'name': 'ship'}, {'id': 104, 'name': 'fountain'}, {'id': 107, 'name': 'washer'}, {'id': 108, 'name': 'plaything'}, {'id': 110, 'name': 'stool'}, {'id': 111, 'name': 'barrel'}, {'id': 112, 'name': 'basket'}, {'id': 115, 'name': 'bag'}, {'id': 116, 'name': 'minibike'}, {'id': 118, 'name': 'oven'}, {'id': 119, 'name': 'ball'}, {'id': 120, 'name': 'food'}, {'id': 121, 'name': 'step'}, {'id': 123, 'name': 'trade name'}, {'id': 124, 'name': 'microwave'}, {'id': 125, 'name': 'pot'}, {'id': 126, 'name': 'animal'}, {'id': 127, 'name': 'bicycle'}, {'id': 129, 'name': 'dishwasher'}, {'id': 130, 'name': 'screen'}, {'id': 132, 'name': 'sculpture'}, {'id': 133, 'name': 'hood'}, {'id': 134, 'name': 'sconce'}, {'id': 135, 'name': 'vase'}, {'id': 136, 'name': 'traffic light'}, {'id': 137, 'name': 'tray'}, {'id': 138, 'name': 'ashcan'}, {'id': 139, 'name': 'fan'}, {'id': 142, 'name': 'plate'}, {'id': 143, 'name': 'monitor'}, {'id': 144, 'name': 'bulletin board'}, {'id': 146, 'name': 'radiator'}, {'id': 147, 'name': 'glass'}, {'id': 148, 'name': 'clock'}, {'id': 149, 'name': 'flag'}]
13 | 
14 | 
15 | _PREDEFINED_SPLITS = {
16 |     # point annotations without masks
17 |     "ade20k_instance_train": (
18 |         "ADEChallengeData2016/images/training",
19 |         "ADEChallengeData2016/ade20k_instance_train.json",
20 |     ),
21 |     "ade20k_instance_val": (
22 |         "ADEChallengeData2016/images/validation",
23 |         "ADEChallengeData2016/ade20k_instance_val.json",
24 |     ),
25 | }
26 | 
27 | 
def _get_ade_instances_meta():
    """
    Build the metadata dict for the ADE20K instance datasets from ``ADE_CATEGORIES``.

    Returns:
        dict with two entries:
            "thing_dataset_id_to_contiguous_id": maps each (non-contiguous) ADE category id
                to its position in ``ADE_CATEGORIES``, i.e. an id in [0, 99].
            "thing_classes": the category names, in ``ADE_CATEGORIES`` order.
    """
    thing_ids = []
    for category in ADE_CATEGORIES:
        thing_ids.append(category["id"])
    # The category table is expected to list exactly 100 categories.
    assert len(thing_ids) == 100, len(thing_ids)

    contiguous_id_of = dict(zip(thing_ids, range(len(thing_ids))))
    class_names = [category["name"] for category in ADE_CATEGORIES]
    return {
        "thing_dataset_id_to_contiguous_id": contiguous_id_of,
        "thing_classes": class_names,
    }
39 | 
40 | 
def register_all_ade20k_instance(root):
    """
    Register every split listed in ``_PREDEFINED_SPLITS`` via ``register_coco_instances``.

    Args:
        root: directory that the image folders and annotation files are joined onto.
    """
    for name, (image_dir, annotation_file) in _PREDEFINED_SPLITS.items():
        metadata = _get_ade_instances_meta()
        # An annotation location containing "://" is passed through verbatim;
        # anything else is treated as a path relative to `root`.
        if "://" in annotation_file:
            json_path = annotation_file
        else:
            json_path = os.path.join(root, annotation_file)
        register_coco_instances(name, metadata, json_path, os.path.join(root, image_dir))
50 | 
51 | 
# Registration runs at import time. The dataset root comes from the DETECTRON2_DATASETS
# environment variable and falls back to "datasets" when it is unset.
_root = os.environ.get("DETECTRON2_DATASETS", "datasets")
register_all_ade20k_instance(_root)
54 | 


--------------------------------------------------------------------------------
/mask2former/evaluation/instance_evaluation.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | import contextlib
  3 | import copy
  4 | import io
  5 | import itertools
  6 | import json
  7 | import logging
  8 | import numpy as np
  9 | import os
 10 | import pickle
 11 | from collections import OrderedDict
 12 | import pycocotools.mask as mask_util
 13 | import torch
 14 | from pycocotools.coco import COCO
 15 | from pycocotools.cocoeval import COCOeval
 16 | from tabulate import tabulate
 17 | 
 18 | import detectron2.utils.comm as comm
 19 | from detectron2.config import CfgNode
 20 | from detectron2.data import MetadataCatalog
 21 | from detectron2.data.datasets.coco import convert_to_coco_json
 22 | from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco
 23 | from detectron2.evaluation.fast_eval_api import COCOeval_opt
 24 | from detectron2.structures import Boxes, BoxMode, pairwise_iou
 25 | from detectron2.utils.file_io import PathManager
 26 | from detectron2.utils.logger import create_small_table
 27 | 
 28 | 
 29 | # modified from COCOEvaluator for instance segmentation
 30 | class InstanceSegEvaluator(COCOEvaluator):
 31 |     """
 32 |     Evaluate AR for object proposals, AP for instance detection/segmentation, AP
 33 |     for keypoint detection outputs using COCO's metrics.
 34 |     See http://cocodataset.org/#detection-eval and
 35 |     http://cocodataset.org/#keypoints-eval to understand its metrics.
 36 |     The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means
 37 |     the metric cannot be computed (e.g. due to no predictions made).
 38 | 
 39 |     In addition to COCO, this evaluator is able to support any bounding box detection,
 40 |     instance segmentation, or keypoint detection dataset.
 41 |     """
 42 | 
 43 |     def _eval_predictions(self, predictions, img_ids=None):
 44 |         """
 45 |         Evaluate predictions. Fill self._results with the metrics of the tasks.
 46 |         """
 47 |         self._logger.info("Preparing results for COCO format ...")
 48 |         coco_results = list(itertools.chain(*[x["instances"] for x in predictions]))
 49 |         tasks = self._tasks or self._tasks_from_predictions(coco_results)
 50 | 
 51 |         # unmap the category ids for COCO
 52 |         if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
 53 |             dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id
 54 |             # all_contiguous_ids = list(dataset_id_to_contiguous_id.values())
 55 |             # num_classes = len(all_contiguous_ids)
 56 |             # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1
 57 | 
 58 |             reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()}
 59 |             for result in coco_results:
 60 |                 category_id = result["category_id"]
 61 |                 # assert category_id < num_classes, (
 62 |                 #     f"A prediction has class={category_id}, "
 63 |                 #     f"but the dataset only has {num_classes} classes and "
 64 |                 #     f"predicted class id should be in [0, {num_classes - 1}]."
 65 |                 # )
 66 |                 assert category_id in reverse_id_mapping, (
 67 |                     f"A prediction has class={category_id}, "
 68 |                     f"but the dataset only has class ids in {dataset_id_to_contiguous_id}."
 69 |                 )
 70 |                 result["category_id"] = reverse_id_mapping[category_id]
 71 | 
 72 |         if self._output_dir:
 73 |             file_path = os.path.join(self._output_dir, "coco_instances_results.json")
 74 |             self._logger.info("Saving results to {}".format(file_path))
 75 |             with PathManager.open(file_path, "w") as f:
 76 |                 f.write(json.dumps(coco_results))
 77 |                 f.flush()
 78 | 
 79 |         if not self._do_evaluation:
 80 |             self._logger.info("Annotations are not available for evaluation.")
 81 |             return
 82 | 
 83 |         self._logger.info(
 84 |             "Evaluating predictions with {} COCO API...".format(
 85 |                 "unofficial" if self._use_fast_impl else "official"
 86 |             )
 87 |         )
 88 |         for task in sorted(tasks):
 89 |             assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!"
 90 |             coco_eval = (
 91 |                 _evaluate_predictions_on_coco(
 92 |                     self._coco_api,
 93 |                     coco_results,
 94 |                     task,
 95 |                     kpt_oks_sigmas=self._kpt_oks_sigmas,
 96 |                     use_fast_impl=self._use_fast_impl,
 97 |                     img_ids=img_ids,
 98 |                     max_dets_per_image=self._max_dets_per_image,
 99 |                 )
100 |                 if len(coco_results) > 0
101 |                 else None  # cocoapi does not handle empty results very well
102 |             )
103 | 
104 |             res = self._derive_coco_results(
105 |                 coco_eval, task, class_names=self._metadata.get("thing_classes")
106 |             )
107 |             self._results[task] = res
108 | 


--------------------------------------------------------------------------------
/lbvq/utils/misc.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Misc functions, including distributed helpers.
  3 | 
  4 | Mostly copy-paste from torchvision references.
  5 | """
  6 | from typing import List, Optional
  7 | 
  8 | import torch
  9 | import torch.distributed as dist
 10 | import torchvision
 11 | from torch import Tensor
 12 | 
 13 | if float(torchvision.__version__.split(".")[1]) < 7.0:
 14 |     from torchvision.ops import _new_empty_tensor
 15 |     from torchvision.ops.misc import _output_size
 16 | 
 17 | 
 18 | def _max_by_axis(the_list):
 19 |     # type: (List[List[int]]) -> List[int]
 20 |     maxes = the_list[0]
 21 |     for sublist in the_list[1:]:
 22 |         for index, item in enumerate(sublist):
 23 |             maxes[index] = max(maxes[index], item)
 24 |     return maxes
 25 | 
 26 | 
 27 | def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None):
 28 |     # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor
 29 |     """
 30 |     Equivalent to nn.functional.interpolate, but with support for empty batch sizes.
 31 |     This will eventually be supported natively by PyTorch, and this
 32 |     class can go away.
 33 |     """
 34 |     if float(torchvision.__version__.split(".")[1]) < 7.0:
 35 |         if input.numel() > 0:
 36 |             return torch.nn.functional.interpolate(
 37 |                 input, size, scale_factor, mode, align_corners
 38 |             )
 39 | 
 40 |         output_shape = _output_size(2, input, size, scale_factor)
 41 |         output_shape = list(input.shape[:-2]) + list(output_shape)
 42 |         return _new_empty_tensor(input, output_shape)
 43 |     else:
 44 |         return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners)
 45 | 
 46 | 
 47 | class NestedTensor(object):
 48 |     def __init__(self, tensors, mask: Optional[Tensor]):
 49 |         self.tensors = tensors
 50 |         self.mask = mask
 51 | 
 52 |     def to(self, device):
 53 |         # type: (Device) -> NestedTensor # noqa
 54 |         cast_tensor = self.tensors.to(device)
 55 |         mask = self.mask
 56 |         if mask is not None:
 57 |             assert mask is not None
 58 |             cast_mask = mask.to(device)
 59 |         else:
 60 |             cast_mask = None
 61 |         return NestedTensor(cast_tensor, cast_mask)
 62 | 
 63 |     def decompose(self):
 64 |         return self.tensors, self.mask
 65 | 
 66 |     def __repr__(self):
 67 |         return str(self.tensors)
 68 | 
 69 | 
 70 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
 71 |     # TODO make this more general
 72 |     if tensor_list[0].ndim == 3:
 73 |         if torchvision._is_tracing():
 74 |             # nested_tensor_from_tensor_list() does not export well to ONNX
 75 |             # call _onnx_nested_tensor_from_tensor_list() instead
 76 |             return _onnx_nested_tensor_from_tensor_list(tensor_list)
 77 | 
 78 |         # TODO make it support different-sized images
 79 |         max_size = _max_by_axis([list(img.shape) for img in tensor_list])
 80 |         # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list]))
 81 |         batch_shape = [len(tensor_list)] + max_size
 82 |         b, c, h, w = batch_shape
 83 |         dtype = tensor_list[0].dtype
 84 |         device = tensor_list[0].device
 85 |         tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
 86 |         mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
 87 |         for img, pad_img, m in zip(tensor_list, tensor, mask):
 88 |             pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
 89 |             m[: img.shape[1], : img.shape[2]] = False
 90 |     else:
 91 |         raise ValueError("not supported")
 92 |     return NestedTensor(tensor, mask)
 93 | 
 94 | 
 95 | # _onnx_nested_tensor_from_tensor_list() is an implementation of
 96 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing.
 97 | @torch.jit.unused
 98 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:
 99 |     max_size = []
100 |     for i in range(tensor_list[0].dim()):
101 |         max_size_i = torch.max(
102 |             torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)
103 |         ).to(torch.int64)
104 |         max_size.append(max_size_i)
105 |     max_size = tuple(max_size)
106 | 
107 |     # work around for
108 |     # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
109 |     # m[: img.shape[1], :img.shape[2]] = False
110 |     # which is not yet supported in onnx
111 |     padded_imgs = []
112 |     padded_masks = []
113 |     for img in tensor_list:
114 |         padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
115 |         padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0]))
116 |         padded_imgs.append(padded_img)
117 | 
118 |         m = torch.zeros_like(img[0], dtype=torch.int, device=img.device)
119 |         padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1)
120 |         padded_masks.append(padded_mask.to(torch.bool))
121 | 
122 |     tensor = torch.stack(padded_imgs)
123 |     mask = torch.stack(padded_masks)
124 | 
125 |     return NestedTensor(tensor, mask=mask)
126 | 
127 | 
def is_dist_avail_and_initialized():
    """Return True only when torch.distributed is both available and initialized."""
    return dist.is_available() and dist.is_initialized()
134 | 


--------------------------------------------------------------------------------
/mask2former/modeling/meta_arch/mask_former_head.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | import logging
  3 | from copy import deepcopy
  4 | from typing import Callable, Dict, List, Optional, Tuple, Union
  5 | 
  6 | import fvcore.nn.weight_init as weight_init
  7 | from torch import nn
  8 | from torch.nn import functional as F
  9 | 
 10 | from detectron2.config import configurable
 11 | from detectron2.layers import Conv2d, ShapeSpec, get_norm
 12 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
 13 | 
 14 | from ..transformer_decoder.maskformer_transformer_decoder import build_transformer_decoder
 15 | from ..pixel_decoder.fpn import build_pixel_decoder
 16 | 
 17 | 
 18 | @SEM_SEG_HEADS_REGISTRY.register()
 19 | class MaskFormerHead(nn.Module):
 20 | 
 21 |     _version = 2
 22 | 
 23 |     def _load_from_state_dict(
 24 |         self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
 25 |     ):
 26 |         # NOTE Modified by Sukjun Hwang: Issues with recent detectron2 versions.
 27 |         version = 2 # local_metadata.get("version", None)
 28 |         if version is None or version < 2:
 29 |             # Do not warn if train from scratch
 30 |             scratch = True
 31 |             logger = logging.getLogger(__name__)
 32 |             for k in list(state_dict.keys()):
 33 |                 newk = k
 34 |                 if "sem_seg_head" in k and not k.startswith(prefix + "predictor"):
 35 |                     newk = k.replace(prefix, prefix + "pixel_decoder.")
 36 |                     # logger.debug(f"{k} ==> {newk}")
 37 |                 if newk != k:
 38 |                     state_dict[newk] = state_dict[k]
 39 |                     del state_dict[k]
 40 |                     scratch = False
 41 | 
 42 |             if not scratch:
 43 |                 logger.warning(
 44 |                     f"Weight format of {self.__class__.__name__} have changed! "
 45 |                     "Please upgrade your models. Applying automatic conversion now ..."
 46 |                 )
 47 | 
 48 |     @configurable
 49 |     def __init__(
 50 |         self,
 51 |         input_shape: Dict[str, ShapeSpec],
 52 |         *,
 53 |         num_classes: int,
 54 |         pixel_decoder: nn.Module,
 55 |         loss_weight: float = 1.0,
 56 |         ignore_value: int = -1,
 57 |         # extra parameters
 58 |         transformer_predictor: nn.Module,
 59 |         transformer_in_feature: str,
 60 |     ):
 61 |         """
 62 |         NOTE: this interface is experimental.
 63 |         Args:
 64 |             input_shape: shapes (channels and stride) of the input features
 65 |             num_classes: number of classes to predict
 66 |             pixel_decoder: the pixel decoder module
 67 |             loss_weight: loss weight
 68 |             ignore_value: category id to be ignored during training.
 69 |             transformer_predictor: the transformer decoder that makes prediction
 70 |             transformer_in_feature: input feature name to the transformer_predictor
 71 |         """
 72 |         super().__init__()
 73 |         input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
 74 |         self.in_features = [k for k, v in input_shape]
 75 |         feature_strides = [v.stride for k, v in input_shape]
 76 |         feature_channels = [v.channels for k, v in input_shape]
 77 | 
 78 |         self.ignore_value = ignore_value
 79 |         self.common_stride = 4
 80 |         self.loss_weight = loss_weight
 81 | 
 82 |         self.pixel_decoder = pixel_decoder
 83 |         self.predictor = transformer_predictor
 84 |         self.transformer_in_feature = transformer_in_feature
 85 | 
 86 |         self.num_classes = num_classes
 87 | 
 88 |     @classmethod
 89 |     def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
 90 |         # figure out in_channels to transformer predictor
 91 |         if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder":
 92 |             transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
 93 |         elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding":
 94 |             transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
 95 |         elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder":  # for maskformer2
 96 |             transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
 97 |         else:
 98 |             transformer_predictor_in_channels = input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels
 99 | 
100 |         return {
101 |             "input_shape": {
102 |                 k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
103 |             },
104 |             "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
105 |             "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
106 |             "pixel_decoder": build_pixel_decoder(cfg, input_shape),
107 |             "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT,
108 |             "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE,
109 |             "transformer_predictor": build_transformer_decoder(
110 |                 cfg,
111 |                 transformer_predictor_in_channels,
112 |                 mask_classification=True,
113 |             ),
114 |         }
115 | 
    def forward(self, features, mask=None):
        """Run the head on ``features``; this simply delegates to :meth:`layers`.

        Args:
            features: passed through unchanged to :meth:`layers`.
            mask: optional argument passed through unchanged to :meth:`layers`.

        Returns:
            Whatever :meth:`layers` returns.
        """
        return self.layers(features, mask)
118 | 
119 |     def layers(self, features, mask=None):
120 |         mask_features, clip_mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(features)
121 |         if self.transformer_in_feature == "multi_scale_pixel_decoder":
122 |             predictions = self.predictor(multi_scale_features, mask_features, clip_mask_features, mask)
123 |         else:
124 |             if self.transformer_in_feature == "transformer_encoder":
125 |                 assert (
126 |                     transformer_encoder_features is not None
127 |                 ), "Please use the TransformerEncoderPixelDecoder."
128 |                 predictions = self.predictor(transformer_encoder_features, mask_features, mask)
129 |             elif self.transformer_in_feature == "pixel_embedding":
130 |                 predictions = self.predictor(mask_features, mask_features, mask)
131 |             else:
132 |                 predictions = self.predictor(features[self.transformer_in_feature], mask_features, mask)
133 |         return predictions
134 | 


--------------------------------------------------------------------------------
/mask2former/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
  3 | import copy
  4 | import logging
  5 | 
  6 | import numpy as np
  7 | import torch
  8 | 
  9 | from detectron2.config import configurable
 10 | from detectron2.data import detection_utils as utils
 11 | from detectron2.data import transforms as T
 12 | from detectron2.data.transforms import TransformGen
 13 | from detectron2.structures import BitMasks, Boxes, Instances
 14 | 
 15 | __all__ = ["COCOPanopticNewBaselineDatasetMapper"]
 16 | 
 17 | 
 18 | def build_transform_gen(cfg, is_train):
 19 |     """
 20 |     Create a list of default :class:`Augmentation` from config.
 21 |     Now it includes resizing and flipping.
 22 |     Returns:
 23 |         list[Augmentation]
 24 |     """
 25 |     assert is_train, "Only support training augmentation"
 26 |     image_size = cfg.INPUT.IMAGE_SIZE
 27 |     min_scale = cfg.INPUT.MIN_SCALE
 28 |     max_scale = cfg.INPUT.MAX_SCALE
 29 | 
 30 |     augmentation = []
 31 | 
 32 |     if cfg.INPUT.RANDOM_FLIP != "none":
 33 |         augmentation.append(
 34 |             T.RandomFlip(
 35 |                 horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal",
 36 |                 vertical=cfg.INPUT.RANDOM_FLIP == "vertical",
 37 |             )
 38 |         )
 39 | 
 40 |     augmentation.extend([
 41 |         T.ResizeScale(
 42 |             min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size
 43 |         ),
 44 |         T.FixedSizeCrop(crop_size=(image_size, image_size)),
 45 |     ])
 46 | 
 47 |     return augmentation
 48 | 
 49 | 
 50 | # This is specifically designed for the COCO dataset.
 51 | class COCOPanopticNewBaselineDatasetMapper:
 52 |     """
 53 |     A callable which takes a dataset dict in Detectron2 Dataset format,
 54 |     and map it into a format used by MaskFormer.
 55 | 
 56 |     This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
 57 | 
 58 |     The callable currently does the following:
 59 | 
 60 |     1. Read the image from "file_name"
 61 |     2. Applies geometric transforms to the image and annotation
 62 |     3. Find and applies suitable cropping to the image and annotation
 63 |     4. Prepare image and annotation to Tensors
 64 |     """
 65 | 
 66 |     @configurable
 67 |     def __init__(
 68 |         self,
 69 |         is_train=True,
 70 |         *,
 71 |         tfm_gens,
 72 |         image_format,
 73 |     ):
 74 |         """
 75 |         NOTE: this interface is experimental.
 76 |         Args:
 77 |             is_train: for training or inference
 78 |             augmentations: a list of augmentations or deterministic transforms to apply
 79 |             crop_gen: crop augmentation
 80 |             tfm_gens: data augmentation
 81 |             image_format: an image format supported by :func:`detection_utils.read_image`.
 82 |         """
 83 |         self.tfm_gens = tfm_gens
 84 |         logging.getLogger(__name__).info(
 85 |             "[COCOPanopticNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(
 86 |                 str(self.tfm_gens)
 87 |             )
 88 |         )
 89 | 
 90 |         self.img_format = image_format
 91 |         self.is_train = is_train
 92 | 
 93 |     @classmethod
 94 |     def from_config(cls, cfg, is_train=True):
 95 |         # Build augmentation
 96 |         tfm_gens = build_transform_gen(cfg, is_train)
 97 | 
 98 |         ret = {
 99 |             "is_train": is_train,
100 |             "tfm_gens": tfm_gens,
101 |             "image_format": cfg.INPUT.FORMAT,
102 |         }
103 |         return ret
104 | 
105 |     def __call__(self, dataset_dict):
106 |         """
107 |         Args:
108 |             dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
109 | 
110 |         Returns:
111 |             dict: a format that builtin models in detectron2 accept
112 |         """
113 |         dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
114 |         image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
115 |         utils.check_image_size(dataset_dict, image)
116 | 
117 |         image, transforms = T.apply_transform_gens(self.tfm_gens, image)
118 |         image_shape = image.shape[:2]  # h, w
119 | 
120 |         # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
121 |         # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
122 |         # Therefore it's important to use torch.Tensor.
123 |         dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
124 | 
125 |         if not self.is_train:
126 |             # USER: Modify this if you want to keep them for some reason.
127 |             dataset_dict.pop("annotations", None)
128 |             return dataset_dict
129 | 
130 |         if "pan_seg_file_name" in dataset_dict:
131 |             pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB")
132 |             segments_info = dataset_dict["segments_info"]
133 | 
134 |             # apply the same transformation to panoptic segmentation
135 |             pan_seg_gt = transforms.apply_segmentation(pan_seg_gt)
136 | 
137 |             from panopticapi.utils import rgb2id
138 | 
139 |             pan_seg_gt = rgb2id(pan_seg_gt)
140 | 
141 |             instances = Instances(image_shape)
142 |             classes = []
143 |             masks = []
144 |             for segment_info in segments_info:
145 |                 class_id = segment_info["category_id"]
146 |                 if not segment_info["iscrowd"]:
147 |                     classes.append(class_id)
148 |                     masks.append(pan_seg_gt == segment_info["id"])
149 | 
150 |             classes = np.array(classes)
151 |             instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
152 |             if len(masks) == 0:
153 |                 # Some image does not have annotation (all ignored)
154 |                 instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
155 |                 instances.gt_boxes = Boxes(torch.zeros((0, 4)))
156 |             else:
157 |                 masks = BitMasks(
158 |                     torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
159 |                 )
160 |                 instances.gt_masks = masks.tensor
161 |                 instances.gt_boxes = masks.get_bounding_boxes()
162 | 
163 |             dataset_dict["instances"] = instances
164 | 
165 |         return dataset_dict
166 | 


--------------------------------------------------------------------------------
/demo_lbvq/demo.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import glob
  3 | import multiprocessing as mp
  4 | import os
  5 | 
  6 | # fmt: off
  7 | import sys
  8 | sys.path.insert(1, os.path.join(sys.path[0], '..'))
  9 | # fmt: on
 10 | 
 11 | import tempfile
 12 | import time
 13 | import warnings
 14 | 
 15 | import cv2
 16 | import numpy as np
 17 | import tqdm
 18 | 
 19 | from torch.cuda.amp import autocast
 20 | 
 21 | from detectron2.config import get_cfg
 22 | from detectron2.data.detection_utils import read_image
 23 | from detectron2.projects.deeplab import add_deeplab_config
 24 | from detectron2.utils.logger import setup_logger
 25 | 
 26 | from mask2former import add_maskformer2_config
 27 | from lbvq import add_lbvq_config
 28 | from predictor import VisualizationDemo
 29 | 
 30 | 
 31 | # constants
 32 | WINDOW_NAME = "lbvq video demo"
 33 | 
 34 | 
 35 | def setup_cfg(args):
 36 |     # load config from file and command-line arguments
 37 |     cfg = get_cfg()
 38 |     add_deeplab_config(cfg)
 39 |     add_maskformer2_config(cfg)
 40 |     add_lbvq_config(cfg)
 41 |     cfg.merge_from_file(args.config_file)
 42 |     cfg.merge_from_list(args.opts)
 43 |     cfg.freeze()
 44 |     return cfg
 45 | 
 46 | 
 47 | def get_parser():
 48 |     parser = argparse.ArgumentParser(description="lbvq demo for builtin configs")
 49 |     parser.add_argument(
 50 |         "--config-file",
 51 |         default="configs/youtubevis_2019/lbvq_R50_bs8.yaml",
 52 |         metavar="FILE",
 53 |         help="path to config file",
 54 |     )
 55 |     parser.add_argument("--video-input", help="Path to video file.")
 56 |     parser.add_argument(
 57 |         "--input",
 58 |         nargs="+",
 59 |         help="A list of space separated input images; "
 60 |         "or a single glob pattern such as 'directory/*.jpg'"
 61 |         "this will be treated as frames of a video",
 62 |     )
 63 |     parser.add_argument(
 64 |         "--output",
 65 |         help="A file or directory to save output visualizations. "
 66 |         "If not given, will show output in an OpenCV window.",
 67 |     )
 68 | 
 69 |     parser.add_argument(
 70 |         "--save-frames",
 71 |         default=False,
 72 |         help="Save frame level image outputs.",
 73 |     )
 74 | 
 75 |     parser.add_argument(
 76 |         "--confidence-threshold",
 77 |         type=float,
 78 |         default=0.5,
 79 |         help="Minimum score for instance predictions to be shown",
 80 |     )
 81 |     parser.add_argument(
 82 |         "--opts",
 83 |         help="Modify config options using the command-line 'KEY VALUE' pairs",
 84 |         default=[],
 85 |         nargs=argparse.REMAINDER,
 86 |     )
 87 |     return parser
 88 | 
 89 | 
 90 | def test_opencv_video_format(codec, file_ext):
 91 |     with tempfile.TemporaryDirectory(prefix="video_format_test") as dir:
 92 |         filename = os.path.join(dir, "test_file" + file_ext)
 93 |         writer = cv2.VideoWriter(
 94 |             filename=filename,
 95 |             fourcc=cv2.VideoWriter_fourcc(*codec),
 96 |             fps=float(30),
 97 |             frameSize=(10, 10),
 98 |             isColor=True,
 99 |         )
100 |         [writer.write(np.zeros((10, 10, 3), np.uint8)) for _ in range(30)]
101 |         writer.release()
102 |         if os.path.isfile(filename):
103 |             return True
104 |         return False
105 | 
106 | 
def _write_visualizations(visualized_output, output_dir, frame_names, save_frames, fps=10.0):
    """
    Save the visualized frames into ``output_dir``.

    Always writes ``visualization.mp4``; additionally writes one image per frame
    (named after ``frame_names``) when ``save_frames`` is truthy.

    Args:
        visualized_output: sequence of per-frame visualization objects exposing
            ``height``, ``width``, ``save(path)`` and ``get_image()``.
        output_dir: existing directory to write into.
        frame_names: file name to use for each frame image, aligned with
            ``visualized_output``.
        save_frames: whether to also store the individual frame images.
        fps: frame rate of the written video.
    """
    if save_frames:
        for frame_name, vis_output in zip(frame_names, visualized_output):
            vis_output.save(os.path.join(output_dir, frame_name))

    height, width = visualized_output[0].height, visualized_output[0].width
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    writer = cv2.VideoWriter(
        os.path.join(output_dir, "visualization.mp4"), fourcc, fps, (width, height), True
    )
    try:
        for vis_output in visualized_output:
            # Reverse the channel axis before handing the frame to OpenCV.
            writer.write(vis_output.get_image()[:, :, ::-1])
    finally:
        # Release even if a frame fails so the container is finalized.
        writer.release()


if __name__ == "__main__":
    mp.set_start_method("spawn", force=True)
    args = get_parser().parse_args()
    setup_logger(name="fvcore")
    logger = setup_logger()
    logger.info("Arguments: " + str(args))

    cfg = setup_cfg(args)

    demo = VisualizationDemo(cfg, conf_thres=args.confidence_threshold)

    if args.output:
        # The output path is always treated as a directory.
        os.makedirs(args.output, exist_ok=True)

    vid_frames = None
    frame_names = []

    if args.input:
        if len(args.input) == 1:
            # A single argument is interpreted as a glob pattern.
            args.input = glob.glob(os.path.expanduser(args.input[0]))
            assert args.input, "The input path(s) was not found"

        vid_frames = [read_image(path, format="BGR") for path in args.input]
        frame_names = [os.path.basename(path) for path in args.input]

    elif args.video_input:
        video = cv2.VideoCapture(args.video_input)
        vid_frames = []
        try:
            while video.isOpened():
                success, frame = video.read()
                if not success:
                    break
                vid_frames.append(frame)
        finally:
            # The capture was previously never released.
            video.release()
        frame_names = [f"{idx}.jpg" for idx in range(len(vid_frames))]

    if vid_frames is not None:
        start_time = time.time()
        with autocast():
            predictions, visualized_output = demo.run_on_video(vid_frames)
        logger.info(
            "detected {} instances per frame in {:.2f}s".format(
                len(predictions["pred_scores"]), time.time() - start_time
            )
        )

        if args.output:
            # No camera capture is opened here: the old cv2.VideoCapture(-1)
            # handle was never used and only grabbed a device.
            _write_visualizations(visualized_output, args.output, frame_names, args.save_frames)
193 | 


--------------------------------------------------------------------------------
/mask2former/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | import copy
  3 | import logging
  4 | 
  5 | import numpy as np
  6 | import torch
  7 | from torch.nn import functional as F
  8 | 
  9 | from detectron2.config import configurable
 10 | from detectron2.data import detection_utils as utils
 11 | from detectron2.data import transforms as T
 12 | from detectron2.structures import BitMasks, Instances
 13 | 
 14 | from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper
 15 | 
 16 | __all__ = ["MaskFormerPanopticDatasetMapper"]
 17 | 
 18 | 
 19 | class MaskFormerPanopticDatasetMapper(MaskFormerSemanticDatasetMapper):
 20 |     """
 21 |     A callable which takes a dataset dict in Detectron2 Dataset format,
 22 |     and map it into a format used by MaskFormer for panoptic segmentation.
 23 | 
 24 |     The callable currently does the following:
 25 | 
 26 |     1. Read the image from "file_name"
 27 |     2. Applies geometric transforms to the image and annotation
 28 |     3. Find and applies suitable cropping to the image and annotation
 29 |     4. Prepare image and annotation to Tensors
 30 |     """
 31 | 
 32 |     @configurable
 33 |     def __init__(
 34 |         self,
 35 |         is_train=True,
 36 |         *,
 37 |         augmentations,
 38 |         image_format,
 39 |         ignore_label,
 40 |         size_divisibility,
 41 |     ):
 42 |         """
 43 |         NOTE: this interface is experimental.
 44 |         Args:
 45 |             is_train: for training or inference
 46 |             augmentations: a list of augmentations or deterministic transforms to apply
 47 |             image_format: an image format supported by :func:`detection_utils.read_image`.
 48 |             ignore_label: the label that is ignored to evaluation
 49 |             size_divisibility: pad image size to be divisible by this value
 50 |         """
 51 |         super().__init__(
 52 |             is_train,
 53 |             augmentations=augmentations,
 54 |             image_format=image_format,
 55 |             ignore_label=ignore_label,
 56 |             size_divisibility=size_divisibility,
 57 |         )
 58 | 
 59 |     def __call__(self, dataset_dict):
 60 |         """
 61 |         Args:
 62 |             dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
 63 | 
 64 |         Returns:
 65 |             dict: a format that builtin models in detectron2 accept
 66 |         """
 67 |         assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!"
 68 | 
 69 |         dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
 70 |         image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
 71 |         utils.check_image_size(dataset_dict, image)
 72 | 
 73 |         # semantic segmentation
 74 |         if "sem_seg_file_name" in dataset_dict:
 75 |             # PyTorch transformation not implemented for uint16, so converting it to double first
 76 |             sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double")
 77 |         else:
 78 |             sem_seg_gt = None
 79 | 
 80 |         # panoptic segmentation
 81 |         if "pan_seg_file_name" in dataset_dict:
 82 |             pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB")
 83 |             segments_info = dataset_dict["segments_info"]
 84 |         else:
 85 |             pan_seg_gt = None
 86 |             segments_info = None
 87 | 
 88 |         if pan_seg_gt is None:
 89 |             raise ValueError(
 90 |                 "Cannot find 'pan_seg_file_name' for panoptic segmentation dataset {}.".format(
 91 |                     dataset_dict["file_name"]
 92 |                 )
 93 |             )
 94 | 
 95 |         aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
 96 |         aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
 97 |         image = aug_input.image
 98 |         if sem_seg_gt is not None:
 99 |             sem_seg_gt = aug_input.sem_seg
100 | 
101 |         # apply the same transformation to panoptic segmentation
102 |         pan_seg_gt = transforms.apply_segmentation(pan_seg_gt)
103 | 
104 |         from panopticapi.utils import rgb2id
105 | 
106 |         pan_seg_gt = rgb2id(pan_seg_gt)
107 | 
108 |         # Pad image and segmentation label here!
109 |         image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
110 |         if sem_seg_gt is not None:
111 |             sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long"))
112 |         pan_seg_gt = torch.as_tensor(pan_seg_gt.astype("long"))
113 | 
114 |         if self.size_divisibility > 0:
115 |             image_size = (image.shape[-2], image.shape[-1])
116 |             padding_size = [
117 |                 0,
118 |                 self.size_divisibility - image_size[1],
119 |                 0,
120 |                 self.size_divisibility - image_size[0],
121 |             ]
122 |             image = F.pad(image, padding_size, value=128).contiguous()
123 |             if sem_seg_gt is not None:
124 |                 sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous()
125 |             pan_seg_gt = F.pad(
126 |                 pan_seg_gt, padding_size, value=0
127 |             ).contiguous()  # 0 is the VOID panoptic label
128 | 
129 |         image_shape = (image.shape[-2], image.shape[-1])  # h, w
130 | 
131 |         # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
132 |         # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
133 |         # Therefore it's important to use torch.Tensor.
134 |         dataset_dict["image"] = image
135 |         if sem_seg_gt is not None:
136 |             dataset_dict["sem_seg"] = sem_seg_gt.long()
137 | 
138 |         if "annotations" in dataset_dict:
139 |             raise ValueError("Pemantic segmentation dataset should not have 'annotations'.")
140 | 
141 |         # Prepare per-category binary masks
142 |         pan_seg_gt = pan_seg_gt.numpy()
143 |         instances = Instances(image_shape)
144 |         classes = []
145 |         masks = []
146 |         for segment_info in segments_info:
147 |             class_id = segment_info["category_id"]
148 |             if not segment_info["iscrowd"]:
149 |                 classes.append(class_id)
150 |                 masks.append(pan_seg_gt == segment_info["id"])
151 | 
152 |         classes = np.array(classes)
153 |         instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
154 |         if len(masks) == 0:
155 |             # Some image does not have annotation (all ignored)
156 |             instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
157 |         else:
158 |             masks = BitMasks(
159 |                 torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
160 |             )
161 |             instances.gt_masks = masks.tensor
162 | 
163 |         dataset_dict["instances"] = instances
164 | 
165 |         return dataset_dict
166 | 


--------------------------------------------------------------------------------
/mask2former/modeling/pixel_decoder/ops/modules/ms_deform_attn.py:
--------------------------------------------------------------------------------
  1 | # ------------------------------------------------------------------------------------------------
  2 | # Deformable DETR
  3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
  4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
  5 | # ------------------------------------------------------------------------------------------------
  6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
  7 | # ------------------------------------------------------------------------------------------------
  8 | 
  9 | # Copyright (c) Facebook, Inc. and its affiliates.
 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
 11 | 
 12 | from __future__ import absolute_import
 13 | from __future__ import print_function
 14 | from __future__ import division
 15 | 
 16 | import warnings
 17 | import math
 18 | 
 19 | import torch
 20 | from torch import nn
 21 | import torch.nn.functional as F
 22 | from torch.nn.init import xavier_uniform_, constant_
 23 | 
 24 | from ..functions import MSDeformAttnFunction
 25 | from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch
 26 | 
 27 | 
 28 | def _is_power_of_2(n):
 29 |     if (not isinstance(n, int)) or (n < 0):
 30 |         raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
 31 |     return (n & (n-1) == 0) and n != 0
 32 | 
 33 | 
 34 | class MSDeformAttn(nn.Module):
 35 |     def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
 36 |         """
 37 |         Multi-Scale Deformable Attention Module
 38 |         :param d_model      hidden dimension
 39 |         :param n_levels     number of feature levels
 40 |         :param n_heads      number of attention heads
 41 |         :param n_points     number of sampling points per attention head per feature level
 42 |         """
 43 |         super().__init__()
 44 |         if d_model % n_heads != 0:
 45 |             raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
 46 |         _d_per_head = d_model // n_heads
 47 |         # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
 48 |         if not _is_power_of_2(_d_per_head):
 49 |             warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
 50 |                           "which is more efficient in our CUDA implementation.")
 51 | 
 52 |         self.im2col_step = 128
 53 | 
 54 |         self.d_model = d_model
 55 |         self.n_levels = n_levels
 56 |         self.n_heads = n_heads
 57 |         self.n_points = n_points
 58 | 
 59 |         self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
 60 |         self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
 61 |         self.value_proj = nn.Linear(d_model, d_model)
 62 |         self.output_proj = nn.Linear(d_model, d_model)
 63 | 
 64 |         self._reset_parameters()
 65 | 
 66 |     def _reset_parameters(self):
 67 |         constant_(self.sampling_offsets.weight.data, 0.)
 68 |         thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
 69 |         grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
 70 |         grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
 71 |         for i in range(self.n_points):
 72 |             grid_init[:, :, i, :] *= i + 1
 73 |         with torch.no_grad():
 74 |             self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
 75 |         constant_(self.attention_weights.weight.data, 0.)
 76 |         constant_(self.attention_weights.bias.data, 0.)
 77 |         xavier_uniform_(self.value_proj.weight.data)
 78 |         constant_(self.value_proj.bias.data, 0.)
 79 |         xavier_uniform_(self.output_proj.weight.data)
 80 |         constant_(self.output_proj.bias.data, 0.)
 81 | 
 82 |     def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
 83 |         """
 84 |         :param query                       (N, Length_{query}, C)
 85 |         :param reference_points            (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
 86 |                                         or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
 87 |         :param input_flatten               (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
 88 |         :param input_spatial_shapes        (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
 89 |         :param input_level_start_index     (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
 90 |         :param input_padding_mask          (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements
 91 | 
 92 |         :return output                     (N, Length_{query}, C)
 93 |         """
 94 |         N, Len_q, _ = query.shape
 95 |         N, Len_in, _ = input_flatten.shape
 96 |         assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in
 97 | 
 98 |         value = self.value_proj(input_flatten)
 99 |         if input_padding_mask is not None:
100 |             value = value.masked_fill(input_padding_mask[..., None], float(0))
101 |         value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
102 |         sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
103 |         attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
104 |         attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
105 |         # N, Len_q, n_heads, n_levels, n_points, 2
106 |         if reference_points.shape[-1] == 2:
107 |             offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
108 |             sampling_locations = reference_points[:, :, None, :, None, :] \
109 |                                  + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
110 |         elif reference_points.shape[-1] == 4:
111 |             sampling_locations = reference_points[:, :, None, :, None, :2] \
112 |                                  + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
113 |         else:
114 |             raise ValueError(
115 |                 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
116 |         output = MSDeformAttnFunction.apply(
117 |             value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
118 |         # # For FLOPs calculation only
119 |         # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
120 |         output = self.output_proj(output)
121 |         return output
122 | 


--------------------------------------------------------------------------------
/mask2former/data/dataset_mappers/mask_former_instance_dataset_mapper.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | import copy
  3 | import logging
  4 | 
  5 | import numpy as np
  6 | import pycocotools.mask as mask_util
  7 | import torch
  8 | from torch.nn import functional as F
  9 | 
 10 | from detectron2.config import configurable
 11 | from detectron2.data import detection_utils as utils
 12 | from detectron2.data import transforms as T
 13 | from detectron2.projects.point_rend import ColorAugSSDTransform
 14 | from detectron2.structures import BitMasks, Instances, polygons_to_bitmask
 15 | 
 16 | __all__ = ["MaskFormerInstanceDatasetMapper"]
 17 | 
 18 | 
 19 | class MaskFormerInstanceDatasetMapper:
 20 |     """
 21 |     A callable which takes a dataset dict in Detectron2 Dataset format,
 22 |     and map it into a format used by MaskFormer for instance segmentation.
 23 | 
 24 |     The callable currently does the following:
 25 | 
 26 |     1. Read the image from "file_name"
 27 |     2. Applies geometric transforms to the image and annotation
 28 |     3. Find and applies suitable cropping to the image and annotation
 29 |     4. Prepare image and annotation to Tensors
 30 |     """
 31 | 
 32 |     @configurable
 33 |     def __init__(
 34 |         self,
 35 |         is_train=True,
 36 |         *,
 37 |         augmentations,
 38 |         image_format,
 39 |         size_divisibility,
 40 |     ):
 41 |         """
 42 |         NOTE: this interface is experimental.
 43 |         Args:
 44 |             is_train: for training or inference
 45 |             augmentations: a list of augmentations or deterministic transforms to apply
 46 |             image_format: an image format supported by :func:`detection_utils.read_image`.
 47 |             size_divisibility: pad image size to be divisible by this value
 48 |         """
 49 |         self.is_train = is_train
 50 |         self.tfm_gens = augmentations
 51 |         self.img_format = image_format
 52 |         self.size_divisibility = size_divisibility
 53 | 
 54 |         logger = logging.getLogger(__name__)
 55 |         mode = "training" if is_train else "inference"
 56 |         logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}")
 57 | 
 58 |     @classmethod
 59 |     def from_config(cls, cfg, is_train=True):
 60 |         # Build augmentation
 61 |         augs = [
 62 |             T.ResizeShortestEdge(
 63 |                 cfg.INPUT.MIN_SIZE_TRAIN,
 64 |                 cfg.INPUT.MAX_SIZE_TRAIN,
 65 |                 cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING,
 66 |             )
 67 |         ]
 68 |         if cfg.INPUT.CROP.ENABLED:
 69 |             augs.append(
 70 |                 T.RandomCrop(
 71 |                     cfg.INPUT.CROP.TYPE,
 72 |                     cfg.INPUT.CROP.SIZE,
 73 |                 )
 74 |             )
 75 |         if cfg.INPUT.COLOR_AUG_SSD:
 76 |             augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT))
 77 |         augs.append(T.RandomFlip())
 78 | 
 79 |         ret = {
 80 |             "is_train": is_train,
 81 |             "augmentations": augs,
 82 |             "image_format": cfg.INPUT.FORMAT,
 83 |             "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY,
 84 |         }
 85 |         return ret
 86 | 
 87 |     def __call__(self, dataset_dict):
 88 |         """
 89 |         Args:
 90 |             dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
 91 | 
 92 |         Returns:
 93 |             dict: a format that builtin models in detectron2 accept
 94 |         """
 95 |         assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!"
 96 | 
 97 |         dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
 98 |         image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
 99 |         utils.check_image_size(dataset_dict, image)
100 | 
101 |         aug_input = T.AugInput(image)
102 |         aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
103 |         image = aug_input.image
104 | 
105 |         # transform instnace masks
106 |         assert "annotations" in dataset_dict
107 |         for anno in dataset_dict["annotations"]:
108 |             anno.pop("keypoints", None)
109 | 
110 |         annos = [
111 |             utils.transform_instance_annotations(obj, transforms, image.shape[:2])
112 |             for obj in dataset_dict.pop("annotations")
113 |             if obj.get("iscrowd", 0) == 0
114 |         ]
115 | 
116 |         if len(annos):
117 |             assert "segmentation" in annos[0]
118 |         segms = [obj["segmentation"] for obj in annos]
119 |         masks = []
120 |         for segm in segms:
121 |             if isinstance(segm, list):
122 |                 # polygon
123 |                 masks.append(polygons_to_bitmask(segm, *image.shape[:2]))
124 |             elif isinstance(segm, dict):
125 |                 # COCO RLE
126 |                 masks.append(mask_util.decode(segm))
127 |             elif isinstance(segm, np.ndarray):
128 |                 assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
129 |                     segm.ndim
130 |                 )
131 |                 # mask array
132 |                 masks.append(segm)
133 |             else:
134 |                 raise ValueError(
135 |                     "Cannot convert segmentation of type '{}' to BitMasks!"
136 |                     "Supported types are: polygons as list[list[float] or ndarray],"
137 |                     " COCO-style RLE as a dict, or a binary segmentation mask "
138 |                     " in a 2D numpy array of shape HxW.".format(type(segm))
139 |                 )
140 | 
141 |         # Pad image and segmentation label here!
142 |         image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
143 |         masks = [torch.from_numpy(np.ascontiguousarray(x)) for x in masks]
144 | 
145 |         classes = [int(obj["category_id"]) for obj in annos]
146 |         classes = torch.tensor(classes, dtype=torch.int64)
147 | 
148 |         if self.size_divisibility > 0:
149 |             image_size = (image.shape[-2], image.shape[-1])
150 |             padding_size = [
151 |                 0,
152 |                 self.size_divisibility - image_size[1],
153 |                 0,
154 |                 self.size_divisibility - image_size[0],
155 |             ]
156 |             # pad image
157 |             image = F.pad(image, padding_size, value=128).contiguous()
158 |             # pad mask
159 |             masks = [F.pad(x, padding_size, value=0).contiguous() for x in masks]
160 | 
161 |         image_shape = (image.shape[-2], image.shape[-1])  # h, w
162 | 
163 |         # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
164 |         # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
165 |         # Therefore it's important to use torch.Tensor.
166 |         dataset_dict["image"] = image
167 | 
168 |         # Prepare per-category binary masks
169 |         instances = Instances(image_shape)
170 |         instances.gt_classes = classes
171 |         if len(masks) == 0:
172 |             # Some image does not have annotation (all ignored)
173 |             instances.gt_masks = torch.zeros((0, image.shape[-2], image.shape[-1]))
174 |         else:
175 |             masks = BitMasks(torch.stack(masks))
176 |             instances.gt_masks = masks.tensor
177 | 
178 |         dataset_dict["instances"] = instances
179 | 
180 |         return dataset_dict
181 | 


--------------------------------------------------------------------------------
/mask2former/data/dataset_mappers/mask_former_semantic_dataset_mapper.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | import copy
  3 | import logging
  4 | 
  5 | import numpy as np
  6 | import torch
  7 | from torch.nn import functional as F
  8 | 
  9 | from detectron2.config import configurable
 10 | from detectron2.data import MetadataCatalog
 11 | from detectron2.data import detection_utils as utils
 12 | from detectron2.data import transforms as T
 13 | from detectron2.projects.point_rend import ColorAugSSDTransform
 14 | from detectron2.structures import BitMasks, Instances
 15 | 
 16 | __all__ = ["MaskFormerSemanticDatasetMapper"]
 17 | 
 18 | 
 19 | class MaskFormerSemanticDatasetMapper:
 20 |     """
 21 |     A callable which takes a dataset dict in Detectron2 Dataset format,
 22 |     and map it into a format used by MaskFormer for semantic segmentation.
 23 | 
 24 |     The callable currently does the following:
 25 | 
 26 |     1. Read the image from "file_name"
 27 |     2. Applies geometric transforms to the image and annotation
 28 |     3. Find and applies suitable cropping to the image and annotation
 29 |     4. Prepare image and annotation to Tensors
 30 |     """
 31 | 
 32 |     @configurable
 33 |     def __init__(
 34 |         self,
 35 |         is_train=True,
 36 |         *,
 37 |         augmentations,
 38 |         image_format,
 39 |         ignore_label,
 40 |         size_divisibility,
 41 |     ):
 42 |         """
 43 |         NOTE: this interface is experimental.
 44 |         Args:
 45 |             is_train: for training or inference
 46 |             augmentations: a list of augmentations or deterministic transforms to apply
 47 |             image_format: an image format supported by :func:`detection_utils.read_image`.
 48 |             ignore_label: the label that is ignored to evaluation
 49 |             size_divisibility: pad image size to be divisible by this value
 50 |         """
 51 |         self.is_train = is_train
 52 |         self.tfm_gens = augmentations
 53 |         self.img_format = image_format
 54 |         self.ignore_label = ignore_label
 55 |         self.size_divisibility = size_divisibility
 56 | 
 57 |         logger = logging.getLogger(__name__)
 58 |         mode = "training" if is_train else "inference"
 59 |         logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}")
 60 | 
 61 |     @classmethod
 62 |     def from_config(cls, cfg, is_train=True):
 63 |         # Build augmentation
 64 |         augs = [
 65 |             T.ResizeShortestEdge(
 66 |                 cfg.INPUT.MIN_SIZE_TRAIN,
 67 |                 cfg.INPUT.MAX_SIZE_TRAIN,
 68 |                 cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING,
 69 |             )
 70 |         ]
 71 |         if cfg.INPUT.CROP.ENABLED:
 72 |             augs.append(
 73 |                 T.RandomCrop_CategoryAreaConstraint(
 74 |                     cfg.INPUT.CROP.TYPE,
 75 |                     cfg.INPUT.CROP.SIZE,
 76 |                     cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA,
 77 |                     cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
 78 |                 )
 79 |             )
 80 |         if cfg.INPUT.COLOR_AUG_SSD:
 81 |             augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT))
 82 |         augs.append(T.RandomFlip())
 83 | 
 84 |         # Assume always applies to the training set.
 85 |         dataset_names = cfg.DATASETS.TRAIN
 86 |         meta = MetadataCatalog.get(dataset_names[0])
 87 |         ignore_label = meta.ignore_label
 88 | 
 89 |         ret = {
 90 |             "is_train": is_train,
 91 |             "augmentations": augs,
 92 |             "image_format": cfg.INPUT.FORMAT,
 93 |             "ignore_label": ignore_label,
 94 |             "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY,
 95 |         }
 96 |         return ret
 97 | 
 98 |     def __call__(self, dataset_dict):
 99 |         """
100 |         Args:
101 |             dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
102 | 
103 |         Returns:
104 |             dict: a format that builtin models in detectron2 accept
105 |         """
106 |         assert self.is_train, "MaskFormerSemanticDatasetMapper should only be used for training!"
107 | 
108 |         dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
109 |         image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
110 |         utils.check_image_size(dataset_dict, image)
111 | 
112 |         if "sem_seg_file_name" in dataset_dict:
113 |             # PyTorch transformation not implemented for uint16, so converting it to double first
114 |             sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double")
115 |         else:
116 |             sem_seg_gt = None
117 | 
118 |         if sem_seg_gt is None:
119 |             raise ValueError(
120 |                 "Cannot find 'sem_seg_file_name' for semantic segmentation dataset {}.".format(
121 |                     dataset_dict["file_name"]
122 |                 )
123 |             )
124 | 
125 |         aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
126 |         aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
127 |         image = aug_input.image
128 |         sem_seg_gt = aug_input.sem_seg
129 | 
130 |         # Pad image and segmentation label here!
131 |         image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
132 |         if sem_seg_gt is not None:
133 |             sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long"))
134 | 
135 |         if self.size_divisibility > 0:
136 |             image_size = (image.shape[-2], image.shape[-1])
137 |             padding_size = [
138 |                 0,
139 |                 self.size_divisibility - image_size[1],
140 |                 0,
141 |                 self.size_divisibility - image_size[0],
142 |             ]
143 |             image = F.pad(image, padding_size, value=128).contiguous()
144 |             if sem_seg_gt is not None:
145 |                 sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous()
146 | 
147 |         image_shape = (image.shape[-2], image.shape[-1])  # h, w
148 | 
149 |         # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
150 |         # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
151 |         # Therefore it's important to use torch.Tensor.
152 |         dataset_dict["image"] = image
153 | 
154 |         if sem_seg_gt is not None:
155 |             dataset_dict["sem_seg"] = sem_seg_gt.long()
156 | 
157 |         if "annotations" in dataset_dict:
158 |             raise ValueError("Semantic segmentation dataset should not have 'annotations'.")
159 | 
160 |         # Prepare per-category binary masks
161 |         if sem_seg_gt is not None:
162 |             sem_seg_gt = sem_seg_gt.numpy()
163 |             instances = Instances(image_shape)
164 |             classes = np.unique(sem_seg_gt)
165 |             # remove ignored region
166 |             classes = classes[classes != self.ignore_label]
167 |             instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
168 | 
169 |             masks = []
170 |             for class_id in classes:
171 |                 masks.append(sem_seg_gt == class_id)
172 | 
173 |             if len(masks) == 0:
174 |                 # Some image does not have annotation (all ignored)
175 |                 instances.gt_masks = torch.zeros((0, sem_seg_gt.shape[-2], sem_seg_gt.shape[-1]))
176 |             else:
177 |                 masks = BitMasks(
178 |                     torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
179 |                 )
180 |                 instances.gt_masks = masks.tensor
181 | 
182 |             dataset_dict["instances"] = instances
183 | 
184 |         return dataset_dict
185 | 


--------------------------------------------------------------------------------
/mask2former/modeling/transformer_decoder/maskformer_transformer_decoder.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py
  3 | import fvcore.nn.weight_init as weight_init
  4 | import torch
  5 | from torch import nn
  6 | from torch.nn import functional as F
  7 | 
  8 | from detectron2.config import configurable
  9 | from detectron2.layers import Conv2d
 10 | from detectron2.utils.registry import Registry
 11 | 
 12 | from .position_encoding import PositionEmbeddingSine
 13 | from .transformer import Transformer
 14 | 
 15 | # Decoder classes register themselves here; build_transformer_decoder looks them up by name.
 16 | TRANSFORMER_DECODER_REGISTRY = Registry("TRANSFORMER_MODULE")
 17 | TRANSFORMER_DECODER_REGISTRY.__doc__ = """
 18 | Registry for transformer module in MaskFormer.
 19 | """
 20 | 
 21 | 
 22 | def build_transformer_decoder(cfg, in_channels, mask_classification=True):
 23 |     """
 24 |     Build a instance embedding branch from `cfg.MODEL.INS_EMBED_HEAD.NAME`.
 25 |     """
 26 |     name = cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME
 27 |     return TRANSFORMER_DECODER_REGISTRY.get(name)(cfg, in_channels, mask_classification)
 28 | 
 29 | 
 30 | @TRANSFORMER_DECODER_REGISTRY.register()
 31 | class StandardTransformerDecoder(nn.Module):
 32 |     @configurable
 33 |     def __init__(
 34 |         self,
 35 |         in_channels,
 36 |         mask_classification=True,
 37 |         *,
 38 |         num_classes: int,
 39 |         hidden_dim: int,
 40 |         num_queries: int,
 41 |         nheads: int,
 42 |         dropout: float,
 43 |         dim_feedforward: int,
 44 |         enc_layers: int,
 45 |         dec_layers: int,
 46 |         pre_norm: bool,
 47 |         deep_supervision: bool,
 48 |         mask_dim: int,
 49 |         enforce_input_project: bool,
 50 |     ):
 51 |         """
 52 |         NOTE: this interface is experimental.
 53 |         Args:
 54 |             in_channels: channels of the input features
 55 |             mask_classification: whether to add mask classifier or not
 56 |             num_classes: number of classes
 57 |             hidden_dim: Transformer feature dimension
 58 |             num_queries: number of queries
 59 |             nheads: number of heads
 60 |             dropout: dropout in Transformer
 61 |             dim_feedforward: feature dimension in feedforward network
 62 |             enc_layers: number of Transformer encoder layers
 63 |             dec_layers: number of Transformer decoder layers
 64 |             pre_norm: whether to use pre-LayerNorm or not
 65 |             deep_supervision: whether to add supervision to every decoder layers
 66 |             mask_dim: mask feature dimension
 67 |             enforce_input_project: add input project 1x1 conv even if input
 68 |                 channels and hidden dim is identical
 69 |         """
 70 |         super().__init__()
 71 | 
 72 |         self.mask_classification = mask_classification
 73 | 
 74 |         # positional encoding
 75 |         N_steps = hidden_dim // 2
 76 |         self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True)
 77 | 
 78 |         transformer = Transformer(
 79 |             d_model=hidden_dim,
 80 |             dropout=dropout,
 81 |             nhead=nheads,
 82 |             dim_feedforward=dim_feedforward,
 83 |             num_encoder_layers=enc_layers,
 84 |             num_decoder_layers=dec_layers,
 85 |             normalize_before=pre_norm,
 86 |             return_intermediate_dec=deep_supervision,
 87 |         )
 88 | 
 89 |         self.num_queries = num_queries
 90 |         self.transformer = transformer
 91 |         hidden_dim = transformer.d_model
 92 | 
 93 |         self.query_embed = nn.Embedding(num_queries, hidden_dim)
 94 | 
 95 |         if in_channels != hidden_dim or enforce_input_project:
 96 |             self.input_proj = Conv2d(in_channels, hidden_dim, kernel_size=1)
 97 |             weight_init.c2_xavier_fill(self.input_proj)
 98 |         else:
 99 |             self.input_proj = nn.Sequential()
100 |         self.aux_loss = deep_supervision
101 | 
102 |         # output FFNs
103 |         if self.mask_classification:
104 |             self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
105 |         self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3)
106 | 
107 |     @classmethod
108 |     def from_config(cls, cfg, in_channels, mask_classification):
109 |         ret = {}
110 |         ret["in_channels"] = in_channels
111 |         ret["mask_classification"] = mask_classification
112 | 
113 |         ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES
114 |         ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM
115 |         ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES
116 |         # Transformer parameters:
117 |         ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS
118 |         ret["dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT
119 |         ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD
120 |         ret["enc_layers"] = cfg.MODEL.MASK_FORMER.ENC_LAYERS
121 |         ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS
122 |         ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM
123 |         ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION
124 |         ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ
125 | 
126 |         ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
127 | 
128 |         return ret
129 | 
130 |     def forward(self, x, mask_features, mask=None):
131 |         if mask is not None:
132 |             mask = F.interpolate(mask[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
133 |         pos = self.pe_layer(x, mask)
134 | 
135 |         src = x
136 |         hs, memory = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos)
137 | 
138 |         if self.mask_classification:
139 |             outputs_class = self.class_embed(hs)
140 |             out = {"pred_logits": outputs_class[-1]}
141 |         else:
142 |             out = {}
143 | 
144 |         if self.aux_loss:
145 |             # [l, bs, queries, embed]
146 |             mask_embed = self.mask_embed(hs)
147 |             outputs_seg_masks = torch.einsum("lbqc,bchw->lbqhw", mask_embed, mask_features)
148 |             out["pred_masks"] = outputs_seg_masks[-1]
149 |             out["aux_outputs"] = self._set_aux_loss(
150 |                 outputs_class if self.mask_classification else None, outputs_seg_masks
151 |             )
152 |         else:
153 |             # FIXME h_boxes takes the last one computed, keep this in mind
154 |             # [bs, queries, embed]
155 |             mask_embed = self.mask_embed(hs[-1])
156 |             outputs_seg_masks = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features)
157 |             out["pred_masks"] = outputs_seg_masks
158 |         return out
159 | 
160 |     @torch.jit.unused
161 |     def _set_aux_loss(self, outputs_class, outputs_seg_masks):
162 |         # this is a workaround to make torchscript happy, as torchscript
163 |         # doesn't support dictionary with non-homogeneous values, such
164 |         # as a dict having both a Tensor and a list.
165 |         if self.mask_classification:
166 |             return [
167 |                 {"pred_logits": a, "pred_masks": b}
168 |                 for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1])
169 |             ]
170 |         else:
171 |             return [{"pred_masks": b} for b in outputs_seg_masks[:-1]]
172 | 
173 | 
174 | class MLP(nn.Module):
175 |     """Very simple multi-layer perceptron (also called FFN)"""
176 | 
177 |     def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
178 |         super().__init__()
179 |         self.num_layers = num_layers
180 |         h = [hidden_dim] * (num_layers - 1)
181 |         self.layers = nn.ModuleList(
182 |             nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])
183 |         )
184 | 
185 |     def forward(self, x):
186 |         for i, layer in enumerate(self.layers):
187 |             x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
188 |         return x
189 | 


--------------------------------------------------------------------------------
/mask2former/data/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
  3 | import copy
  4 | import logging
  5 | 
  6 | import numpy as np
  7 | import torch
  8 | 
  9 | from detectron2.config import configurable
 10 | from detectron2.data import detection_utils as utils
 11 | from detectron2.data import transforms as T
 12 | from detectron2.data.transforms import TransformGen
 13 | from detectron2.structures import BitMasks, Instances
 14 | 
 15 | from pycocotools import mask as coco_mask
 16 | 
 17 | __all__ = ["COCOInstanceNewBaselineDatasetMapper"]
 18 | 
 19 | 
 20 | def convert_coco_poly_to_mask(segmentations, height, width):
 21 |     masks = []
 22 |     for polygons in segmentations:
 23 |         rles = coco_mask.frPyObjects(polygons, height, width)
 24 |         mask = coco_mask.decode(rles)
 25 |         if len(mask.shape) < 3:
 26 |             mask = mask[..., None]
 27 |         mask = torch.as_tensor(mask, dtype=torch.uint8)
 28 |         mask = mask.any(dim=2)
 29 |         masks.append(mask)
 30 |     if masks:
 31 |         masks = torch.stack(masks, dim=0)
 32 |     else:
 33 |         masks = torch.zeros((0, height, width), dtype=torch.uint8)
 34 |     return masks
 35 | 
 36 | 
 37 | def build_transform_gen(cfg, is_train):
 38 |     """
 39 |     Create a list of default :class:`Augmentation` from config.
 40 |     Now it includes resizing and flipping.
 41 |     Returns:
 42 |         list[Augmentation]
 43 |     """
 44 |     assert is_train, "Only support training augmentation"
 45 |     image_size = cfg.INPUT.IMAGE_SIZE
 46 |     min_scale = cfg.INPUT.MIN_SCALE
 47 |     max_scale = cfg.INPUT.MAX_SCALE
 48 | 
 49 |     augmentation = []
 50 | 
 51 |     if cfg.INPUT.RANDOM_FLIP != "none":
 52 |         augmentation.append(
 53 |             T.RandomFlip(
 54 |                 horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal",
 55 |                 vertical=cfg.INPUT.RANDOM_FLIP == "vertical",
 56 |             )
 57 |         )
 58 | 
 59 |     augmentation.extend([
 60 |         T.ResizeScale(
 61 |             min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size
 62 |         ),
 63 |         T.FixedSizeCrop(crop_size=(image_size, image_size)),
 64 |     ])
 65 | 
 66 |     return augmentation
 67 | 
 68 | 
 69 | # This is specifically designed for the COCO dataset.
 70 | class COCOInstanceNewBaselineDatasetMapper:
 71 |     """
 72 |     A callable which takes a dataset dict in Detectron2 Dataset format,
 73 |     and map it into a format used by MaskFormer.
 74 | 
 75 |     This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
 76 | 
 77 |     The callable currently does the following:
 78 | 
 79 |     1. Read the image from "file_name"
 80 |     2. Applies geometric transforms to the image and annotation
 81 |     3. Find and applies suitable cropping to the image and annotation
 82 |     4. Prepare image and annotation to Tensors
 83 |     """
 84 | 
 85 |     @configurable
 86 |     def __init__(
 87 |         self,
 88 |         is_train=True,
 89 |         *,
 90 |         tfm_gens,
 91 |         image_format,
 92 |     ):
 93 |         """
 94 |         NOTE: this interface is experimental.
 95 |         Args:
 96 |             is_train: for training or inference
 97 |             augmentations: a list of augmentations or deterministic transforms to apply
 98 |             tfm_gens: data augmentation
 99 |             image_format: an image format supported by :func:`detection_utils.read_image`.
100 |         """
101 |         self.tfm_gens = tfm_gens
102 |         logging.getLogger(__name__).info(
103 |             "[COCOInstanceNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(str(self.tfm_gens))
104 |         )
105 | 
106 |         self.img_format = image_format
107 |         self.is_train = is_train
108 |     
109 |     @classmethod
110 |     def from_config(cls, cfg, is_train=True):
111 |         # Build augmentation
112 |         tfm_gens = build_transform_gen(cfg, is_train)
113 | 
114 |         ret = {
115 |             "is_train": is_train,
116 |             "tfm_gens": tfm_gens,
117 |             "image_format": cfg.INPUT.FORMAT,
118 |         }
119 |         return ret
120 | 
121 |     def __call__(self, dataset_dict):
122 |         """
123 |         Args:
124 |             dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
125 | 
126 |         Returns:
127 |             dict: a format that builtin models in detectron2 accept
128 |         """
129 |         dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
130 |         image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
131 |         utils.check_image_size(dataset_dict, image)
132 | 
133 |         # TODO: get padding mask
134 |         # by feeding a "segmentation mask" to the same transforms
135 |         padding_mask = np.ones(image.shape[:2])
136 | 
137 |         image, transforms = T.apply_transform_gens(self.tfm_gens, image)
138 |         # the crop transformation has default padding value 0 for segmentation
139 |         padding_mask = transforms.apply_segmentation(padding_mask)
140 |         padding_mask = ~ padding_mask.astype(bool)
141 | 
142 |         image_shape = image.shape[:2]  # h, w
143 | 
144 |         # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
145 |         # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
146 |         # Therefore it's important to use torch.Tensor.
147 |         dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
148 |         dataset_dict["padding_mask"] = torch.as_tensor(np.ascontiguousarray(padding_mask))
149 | 
150 |         if not self.is_train:
151 |             # USER: Modify this if you want to keep them for some reason.
152 |             dataset_dict.pop("annotations", None)
153 |             return dataset_dict
154 | 
155 |         if "annotations" in dataset_dict:
156 |             # USER: Modify this if you want to keep them for some reason.
157 |             for anno in dataset_dict["annotations"]:
158 |                 # Let's always keep mask
159 |                 # if not self.mask_on:
160 |                 #     anno.pop("segmentation", None)
161 |                 anno.pop("keypoints", None)
162 | 
163 |             # USER: Implement additional transformations if you have other types of data
164 |             annos = [
165 |                 utils.transform_instance_annotations(obj, transforms, image_shape)
166 |                 for obj in dataset_dict.pop("annotations")
167 |                 if obj.get("iscrowd", 0) == 0
168 |             ]
169 |             # NOTE: does not support BitMask due to augmentation
170 |             # Current BitMask cannot handle empty objects
171 |             instances = utils.annotations_to_instances(annos, image_shape)
172 |             # After transforms such as cropping are applied, the bounding box may no longer
173 |             # tightly bound the object. As an example, imagine a triangle object
174 |             # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight
175 |             # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to
176 |             # the intersection of original bounding box and the cropping box.
177 |             instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
178 |             # Need to filter empty instances first (due to augmentation)
179 |             instances = utils.filter_empty_instances(instances)
180 |             # Generate masks from polygon
181 |             h, w = instances.image_size
182 |             # image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float)
183 |             if hasattr(instances, 'gt_masks'):
184 |                 gt_masks = instances.gt_masks
185 |                 gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w)
186 |                 instances.gt_masks = gt_masks
187 |             dataset_dict["instances"] = instances
188 | 
189 |         return dataset_dict
190 | 


--------------------------------------------------------------------------------
/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu:
--------------------------------------------------------------------------------
  1 | /*!
  2 | **************************************************************************************************
  3 | * Deformable DETR
  4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
  5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
  6 | **************************************************************************************************
  7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
  8 | **************************************************************************************************
  9 | */
 10 | 
 11 | /*!
 12 | * Copyright (c) Facebook, Inc. and its affiliates.
 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
 14 | */
 15 | 
 16 | #include <vector>
 17 | #include "cuda/ms_deform_im2col_cuda.cuh"
 18 | 
 19 | #include <ATen/ATen.h>
 20 | #include <ATen/cuda/CUDAContext.h>
 21 | #include <cuda.h>
 22 | #include <cuda_runtime.h>
 23 | 
 24 | 
 25 | at::Tensor ms_deform_attn_cuda_forward(
 26 |     const at::Tensor &value, 
 27 |     const at::Tensor &spatial_shapes,
 28 |     const at::Tensor &level_start_index,
 29 |     const at::Tensor &sampling_loc,
 30 |     const at::Tensor &attn_weight,
 31 |     const int im2col_step)
 32 | {
 33 |     AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
 34 |     AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
 35 |     AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
 36 |     AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
 37 |     AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
 38 | 
 39 |     AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
 40 |     AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
 41 |     AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
 42 |     AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
 43 |     AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
 44 | 
 45 |     const int batch = value.size(0);
 46 |     const int spatial_size = value.size(1);
 47 |     const int num_heads = value.size(2);
 48 |     const int channels = value.size(3);
 49 | 
 50 |     const int num_levels = spatial_shapes.size(0);
 51 | 
 52 |     const int num_query = sampling_loc.size(1);
 53 |     const int num_point = sampling_loc.size(4);
 54 | 
 55 |     const int im2col_step_ = std::min(batch, im2col_step);
 56 | 
 57 |     AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
 58 |     
 59 |     auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
 60 | 
 61 |     const int batch_n = im2col_step_;
 62 |     auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
 63 |     auto per_value_size = spatial_size * num_heads * channels;
 64 |     auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
 65 |     auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
 66 |     for (int n = 0; n < batch/im2col_step_; ++n)
 67 |     {
 68 |         auto columns = output_n.select(0, n);
 69 |         AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] {
 70 |             ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
 71 |                 value.data<scalar_t>() + n * im2col_step_ * per_value_size,
 72 |                 spatial_shapes.data<int64_t>(),
 73 |                 level_start_index.data<int64_t>(),
 74 |                 sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
 75 |                 attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
 76 |                 batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
 77 |                 columns.data<scalar_t>());
 78 | 
 79 |         }));
 80 |     }
 81 | 
 82 |     output = output.view({batch, num_query, num_heads*channels});
 83 | 
 84 |     return output;
 85 | }
 86 | 
 87 | 
 88 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
 89 |     const at::Tensor &value, 
 90 |     const at::Tensor &spatial_shapes,
 91 |     const at::Tensor &level_start_index,
 92 |     const at::Tensor &sampling_loc,
 93 |     const at::Tensor &attn_weight,
 94 |     const at::Tensor &grad_output,
 95 |     const int im2col_step)
 96 | {
 97 | 
 98 |     AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
 99 |     AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
100 |     AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
101 |     AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
102 |     AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
103 |     AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
104 | 
105 |     AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
106 |     AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
107 |     AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
108 |     AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
109 |     AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
110 |     AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
111 | 
112 |     const int batch = value.size(0);
113 |     const int spatial_size = value.size(1);
114 |     const int num_heads = value.size(2);
115 |     const int channels = value.size(3);
116 | 
117 |     const int num_levels = spatial_shapes.size(0);
118 | 
119 |     const int num_query = sampling_loc.size(1);
120 |     const int num_point = sampling_loc.size(4);
121 | 
122 |     const int im2col_step_ = std::min(batch, im2col_step);
123 | 
124 |     AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
125 | 
126 |     auto grad_value = at::zeros_like(value);
127 |     auto grad_sampling_loc = at::zeros_like(sampling_loc);
128 |     auto grad_attn_weight = at::zeros_like(attn_weight);
129 | 
130 |     const int batch_n = im2col_step_;
131 |     auto per_value_size = spatial_size * num_heads * channels;
132 |     auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
133 |     auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
134 |     auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
135 |     
136 |     for (int n = 0; n < batch/im2col_step_; ++n)
137 |     {
138 |         auto grad_output_g = grad_output_n.select(0, n);
139 |         AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] {
140 |             ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
141 |                                     grad_output_g.data<scalar_t>(),
142 |                                     value.data<scalar_t>() + n * im2col_step_ * per_value_size,
143 |                                     spatial_shapes.data<int64_t>(),
144 |                                     level_start_index.data<int64_t>(),
145 |                                     sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
146 |                                     attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
147 |                                     batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
148 |                                     grad_value.data<scalar_t>() +  n * im2col_step_ * per_value_size,
149 |                                     grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
150 |                                     grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
151 | 
152 |         }));
153 |     }
154 | 
155 |     return {
156 |         grad_value, grad_sampling_loc, grad_attn_weight
157 |     };
158 | }


--------------------------------------------------------------------------------
/mask2former/data/datasets/register_coco_panoptic_annos_semseg.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | import json
  3 | import os
  4 | 
  5 | from detectron2.data import DatasetCatalog, MetadataCatalog
  6 | from detectron2.data.datasets import load_sem_seg
  7 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
  8 | from detectron2.utils.file_io import PathManager
  9 | 
 10 | 
 11 | _PREDEFINED_SPLITS_COCO_PANOPTIC = {
 12 |     "coco_2017_train_panoptic": (
 13 |         # This is the original panoptic annotation directory
 14 |         "coco/panoptic_train2017",
 15 |         "coco/annotations/panoptic_train2017.json",
 16 |         # This directory contains semantic annotations that are
 17 |         # converted from panoptic annotations.
 18 |         # It is used by PanopticFPN.
 19 |         # You can use the script at detectron2/datasets/prepare_panoptic_fpn.py
 20 |         # to create these directories.
 21 |         "coco/panoptic_semseg_train2017",
 22 |     ),
 23 |     "coco_2017_val_panoptic": (
 24 |         "coco/panoptic_val2017",
 25 |         "coco/annotations/panoptic_val2017.json",
 26 |         "coco/panoptic_semseg_val2017",
 27 |     ),
 28 | }
 29 | 
 30 | 
 31 | def get_metadata():
 32 |     meta = {}
 33 |     # The following metadata maps contiguous id from [0, #thing categories +
 34 |     # #stuff categories) to their names and colors. We have to replica of the
 35 |     # same name and color under "thing_*" and "stuff_*" because the current
 36 |     # visualization function in D2 handles thing and class classes differently
 37 |     # due to some heuristic used in Panoptic FPN. We keep the same naming to
 38 |     # enable reusing existing visualization functions.
 39 |     thing_classes = [k["name"] for k in COCO_CATEGORIES if k["isthing"] == 1]
 40 |     thing_colors = [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 1]
 41 |     stuff_classes = [k["name"] for k in COCO_CATEGORIES]
 42 |     stuff_colors = [k["color"] for k in COCO_CATEGORIES]
 43 | 
 44 |     meta["thing_classes"] = thing_classes
 45 |     meta["thing_colors"] = thing_colors
 46 |     meta["stuff_classes"] = stuff_classes
 47 |     meta["stuff_colors"] = stuff_colors
 48 | 
 49 |     # Convert category id for training:
 50 |     #   category id: like semantic segmentation, it is the class id for each
 51 |     #   pixel. Since there are some classes not used in evaluation, the category
 52 |     #   id is not always contiguous and thus we have two set of category ids:
 53 |     #       - original category id: category id in the original dataset, mainly
 54 |     #           used for evaluation.
 55 |     #       - contiguous category id: [0, #classes), in order to train the linear
 56 |     #           softmax classifier.
 57 |     thing_dataset_id_to_contiguous_id = {}
 58 |     stuff_dataset_id_to_contiguous_id = {}
 59 | 
 60 |     for i, cat in enumerate(COCO_CATEGORIES):
 61 |         if cat["isthing"]:
 62 |             thing_dataset_id_to_contiguous_id[cat["id"]] = i
 63 |         # else:
 64 |         #     stuff_dataset_id_to_contiguous_id[cat["id"]] = i
 65 | 
 66 |         # in order to use sem_seg evaluator
 67 |         stuff_dataset_id_to_contiguous_id[cat["id"]] = i
 68 | 
 69 |     meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id
 70 |     meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id
 71 | 
 72 |     return meta
 73 | 
 74 | 
 75 | def load_coco_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta):
 76 |     """
 77 |     Args:
 78 |         image_dir (str): path to the raw dataset. e.g., "~/coco/train2017".
 79 |         gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017".
 80 |         json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json".
 81 |     Returns:
 82 |         list[dict]: a list of dicts in Detectron2 standard format. (See
 83 |         `Using Custom Datasets </tutorials/datasets.html>`_ )
 84 |     """
 85 | 
 86 |     def _convert_category_id(segment_info, meta):
 87 |         if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]:
 88 |             segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][
 89 |                 segment_info["category_id"]
 90 |             ]
 91 |             segment_info["isthing"] = True
 92 |         else:
 93 |             segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][
 94 |                 segment_info["category_id"]
 95 |             ]
 96 |             segment_info["isthing"] = False
 97 |         return segment_info
 98 | 
 99 |     with PathManager.open(json_file) as f:
100 |         json_info = json.load(f)
101 | 
102 |     ret = []
103 |     for ann in json_info["annotations"]:
104 |         image_id = int(ann["image_id"])
105 |         # TODO: currently we assume image and label has the same filename but
106 |         # different extension, and images have extension ".jpg" for COCO. Need
107 |         # to make image extension a user-provided argument if we extend this
108 |         # function to support other COCO-like datasets.
109 |         image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg")
110 |         label_file = os.path.join(gt_dir, ann["file_name"])
111 |         sem_label_file = os.path.join(semseg_dir, ann["file_name"])
112 |         segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]]
113 |         ret.append(
114 |             {
115 |                 "file_name": image_file,
116 |                 "image_id": image_id,
117 |                 "pan_seg_file_name": label_file,
118 |                 "sem_seg_file_name": sem_label_file,
119 |                 "segments_info": segments_info,
120 |             }
121 |         )
122 |     assert len(ret), f"No images found in {image_dir}!"
123 |     assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"]
124 |     assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"]
125 |     assert PathManager.isfile(ret[0]["sem_seg_file_name"]), ret[0]["sem_seg_file_name"]
126 |     return ret
127 | 
128 | 
def register_coco_panoptic_annos_sem_seg(
    name, metadata, image_root, panoptic_root, panoptic_json, sem_seg_root, instances_json
):
    """Register the "<name>_with_sem_seg" dataset and refresh the thing metadata of ``name``.

    Args:
        name (str): name of the already-known panoptic dataset, e.g. "coco_2017_train_panoptic".
        metadata (dict): extra metadata; "thing_classes" and "thing_colors" are read here and
            the whole dict is attached to the new dataset.
        image_root (str): directory of the images.
        panoptic_root (str): directory of the panoptic annotations.
        panoptic_json (str): path to the panoptic json file.
        sem_seg_root (str): directory of the semantic annotations.
        instances_json (str): stored as "json_file" in the new dataset's metadata.
    """
    panoptic_meta = MetadataCatalog.get(name)
    # Drop the existing values before setting new ones.
    # NOTE(review): presumably needed because the catalog refuses to overwrite an attribute
    # with a different value -- confirm against detectron2's Metadata.
    for attr in ("thing_classes", "thing_colors"):
        delattr(panoptic_meta, attr)
    panoptic_meta.set(
        thing_classes=metadata["thing_classes"],
        thing_colors=metadata["thing_colors"],
    )

    # the name is "coco_2017_train_panoptic_with_sem_seg" and "coco_2017_val_panoptic_with_sem_seg"
    semantic_name = name + "_with_sem_seg"

    def _load_dataset():
        # Deferred loader: the json is only read when the catalog calls this function.
        return load_coco_panoptic_json(
            panoptic_json, image_root, panoptic_root, sem_seg_root, metadata
        )

    DatasetCatalog.register(semantic_name, _load_dataset)
    MetadataCatalog.get(semantic_name).set(
        sem_seg_root=sem_seg_root,
        panoptic_root=panoptic_root,
        image_root=image_root,
        panoptic_json=panoptic_json,
        json_file=instances_json,
        evaluator_type="coco_panoptic_seg",
        ignore_label=255,
        label_divisor=1000,
        **metadata,
    )
158 | 
159 | 
def register_all_coco_panoptic_annos_sem_seg(root):
    """Register every split listed in ``_PREDEFINED_SPLITS_COCO_PANOPTIC``.

    Args:
        root (str): dataset root directory that the predefined relative paths are joined onto.
    """
    for prefix, split_paths in _PREDEFINED_SPLITS_COCO_PANOPTIC.items():
        panoptic_root, panoptic_json, semantic_root = split_paths

        # The image directory and instance json are taken from the metadata of the dataset
        # whose name is the split name without its "_panoptic" suffix.
        instances_meta = MetadataCatalog.get(prefix[: -len("_panoptic")])
        image_root = instances_meta.image_root
        instances_json = instances_meta.json_file

        register_coco_panoptic_annos_sem_seg(
            name=prefix,
            metadata=get_metadata(),
            image_root=image_root,
            panoptic_root=os.path.join(root, panoptic_root),
            panoptic_json=os.path.join(root, panoptic_json),
            sem_seg_root=os.path.join(root, semantic_root),
            instances_json=instances_json,
        )
178 | 
179 | 
# Registration happens at import time. The dataset root is $DETECTRON2_DATASETS when that
# variable is set, otherwise the relative directory "datasets".
_root = os.environ.get("DETECTRON2_DATASETS", "datasets")
register_all_coco_panoptic_annos_sem_seg(_root)
182 | 


--------------------------------------------------------------------------------
/mask2former/modeling/matcher.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/matcher.py
  3 | """
  4 | Modules to compute the matching cost and solve the corresponding LSAP.
  5 | """
  6 | import torch
  7 | import torch.nn.functional as F
  8 | from scipy.optimize import linear_sum_assignment
  9 | from torch import nn
 10 | from torch.cuda.amp import autocast
 11 | 
 12 | from detectron2.projects.point_rend.point_features import point_sample
 13 | 
 14 | 
 15 | def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor):
 16 |     """
 17 |     Compute the DICE loss, similar to generalized IOU for masks
 18 |     Args:
 19 |         inputs: A float tensor of arbitrary shape.
 20 |                 The predictions for each example.
 21 |         targets: A float tensor with the same shape as inputs. Stores the binary
 22 |                  classification label for each element in inputs
 23 |                 (0 for the negative class and 1 for the positive class).
 24 |     """
 25 |     inputs = inputs.sigmoid()
 26 |     inputs = inputs.flatten(1)
 27 |     numerator = 2 * torch.einsum("nc,mc->nm", inputs, targets)
 28 |     denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :]
 29 |     loss = 1 - (numerator + 1) / (denominator + 1)
 30 |     return loss
 31 | 
 32 | 
 33 | batch_dice_loss_jit = torch.jit.script(
 34 |     batch_dice_loss
 35 | )  # type: torch.jit.ScriptModule
 36 | 
 37 | 
 38 | def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor):
 39 |     """
 40 |     Args:
 41 |         inputs: A float tensor of arbitrary shape.
 42 |                 The predictions for each example.
 43 |         targets: A float tensor with the same shape as inputs. Stores the binary
 44 |                  classification label for each element in inputs
 45 |                 (0 for the negative class and 1 for the positive class).
 46 |     Returns:
 47 |         Loss tensor
 48 |     """
 49 |     hw = inputs.shape[1]
 50 | 
 51 |     pos = F.binary_cross_entropy_with_logits(
 52 |         inputs, torch.ones_like(inputs), reduction="none"
 53 |     )
 54 |     neg = F.binary_cross_entropy_with_logits(
 55 |         inputs, torch.zeros_like(inputs), reduction="none"
 56 |     )
 57 | 
 58 |     loss = torch.einsum("nc,mc->nm", pos, targets) + torch.einsum(
 59 |         "nc,mc->nm", neg, (1 - targets)
 60 |     )
 61 | 
 62 |     return loss / hw
 63 | 
 64 | 
 65 | batch_sigmoid_ce_loss_jit = torch.jit.script(
 66 |     batch_sigmoid_ce_loss
 67 | )  # type: torch.jit.ScriptModule
 68 | 
 69 | 
 70 | class HungarianMatcher(nn.Module):
 71 |     """This class computes an assignment between the targets and the predictions of the network
 72 | 
 73 |     For efficiency reasons, the targets don't include the no_object. Because of this, in general,
 74 |     there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
 75 |     while the others are un-matched (and thus treated as non-objects).
 76 |     """
 77 | 
 78 |     def __init__(
 79 |         self, cost_class: float = 1, cost_mask: float = 1, cost_dice: float = 1,
 80 |         num_points: int = 0,
 81 |     ):
 82 |         """Creates the matcher
 83 | 
 84 |         Params:
 85 |             cost_class: This is the relative weight of the classification error in the matching cost
 86 |             cost_mask: This is the relative weight of the focal loss of the binary mask in the matching cost
 87 |             cost_dice: This is the relative weight of the dice loss of the binary mask in the matching cost
 88 |         """
 89 |         super().__init__()
 90 |         self.cost_class = cost_class
 91 |         self.cost_mask = cost_mask
 92 |         self.cost_dice = cost_dice
 93 | 
 94 |         assert cost_class != 0 or cost_mask != 0 or cost_dice != 0, "all costs cant be 0"
 95 | 
 96 |         self.num_points = num_points
 97 | 
 98 |     @torch.no_grad()
 99 |     def memory_efficient_forward(self, outputs, targets):
100 |         """More memory-friendly matching"""
101 |         bs, num_queries = outputs["pred_logits"].shape[:2]
102 | 
103 |         indices = []
104 | 
105 |         # Iterate through batch size
106 |         for b in range(bs):
107 | 
108 |             out_prob = outputs["pred_logits"][b].softmax(-1)  # [num_queries, num_classes]
109 |             tgt_ids = targets[b]["labels"]
110 | 
111 |             # Compute the classification cost. Contrary to the loss, we don't use the NLL,
112 |             # but approximate it in 1 - proba[target class].
113 |             # The 1 is a constant that doesn't change the matching, it can be ommitted.
114 |             cost_class = -out_prob[:, tgt_ids]
115 | 
116 |             out_mask = outputs["pred_masks"][b]  # [num_queries, H_pred, W_pred]
117 |             # gt masks are already padded when preparing target
118 |             tgt_mask = targets[b]["masks"].to(out_mask)
119 | 
120 |             out_mask = out_mask[:, None]
121 |             tgt_mask = tgt_mask[:, None]
122 |             # all masks share the same set of points for efficient matching!
123 |             point_coords = torch.rand(1, self.num_points, 2, device=out_mask.device)
124 |             # get gt labels
125 |             tgt_mask = point_sample(
126 |                 tgt_mask,
127 |                 point_coords.repeat(tgt_mask.shape[0], 1, 1),
128 |                 align_corners=False,
129 |             ).squeeze(1)
130 | 
131 |             out_mask = point_sample(
132 |                 out_mask,
133 |                 point_coords.repeat(out_mask.shape[0], 1, 1),
134 |                 align_corners=False,
135 |             ).squeeze(1)
136 | 
137 |             with autocast(enabled=False):
138 |                 out_mask = out_mask.float()
139 |                 tgt_mask = tgt_mask.float()
140 |                 # Compute the focal loss between masks
141 |                 cost_mask = batch_sigmoid_ce_loss_jit(out_mask, tgt_mask)
142 | 
143 |                 # Compute the dice loss betwen masks
144 |                 cost_dice = batch_dice_loss(out_mask, tgt_mask)
145 |             
146 |             # Final cost matrix
147 |             C = (
148 |                 self.cost_mask * cost_mask
149 |                 + self.cost_class * cost_class
150 |                 + self.cost_dice * cost_dice
151 |             )
152 |             C = C.reshape(num_queries, -1).cpu()
153 | 
154 |             indices.append(linear_sum_assignment(C))
155 | 
156 |         return [
157 |             (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64))
158 |             for i, j in indices
159 |         ]
160 | 
161 |     @torch.no_grad()
162 |     def forward(self, outputs, targets):
163 |         """Performs the matching
164 | 
165 |         Params:
166 |             outputs: This is a dict that contains at least these entries:
167 |                  "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
168 |                  "pred_masks": Tensor of dim [batch_size, num_queries, H_pred, W_pred] with the predicted masks
169 | 
170 |             targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
171 |                  "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
172 |                            objects in the target) containing the class labels
173 |                  "masks": Tensor of dim [num_target_boxes, H_gt, W_gt] containing the target masks
174 | 
175 |         Returns:
176 |             A list of size batch_size, containing tuples of (index_i, index_j) where:
177 |                 - index_i is the indices of the selected predictions (in order)
178 |                 - index_j is the indices of the corresponding selected targets (in order)
179 |             For each batch element, it holds:
180 |                 len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
181 |         """
182 |         return self.memory_efficient_forward(outputs, targets)
183 | 
184 |     def __repr__(self, _repr_indent=4):
185 |         head = "Matcher " + self.__class__.__name__
186 |         body = [
187 |             "cost_class: {}".format(self.cost_class),
188 |             "cost_mask: {}".format(self.cost_mask),
189 |             "cost_dice: {}".format(self.cost_dice),
190 |         ]
191 |         lines = [head] + [" " * _repr_indent + line for line in body]
192 |         return "\n".join(lines)
193 | 


--------------------------------------------------------------------------------
/lbvq/modeling/lbvq_matcher.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Modules to compute the matching cost and solve the corresponding LSAP.
  3 | """
  4 | import torch
  5 | import torch.nn.functional as F
  6 | from scipy.optimize import linear_sum_assignment
  7 | from torch import nn
  8 | from torch.cuda.amp import autocast
  9 | 
 10 | from detectron2.projects.point_rend.point_features import point_sample
 11 | 
 12 | 
 13 | def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor):
 14 |     """
 15 |     Compute the DICE loss, similar to generalized IOU for masks
 16 |     Args:
 17 |         inputs: A float tensor of arbitrary shape.
 18 |                 The predictions for each example.
 19 |         targets: A float tensor with the same shape as inputs. Stores the binary
 20 |                  classification label for each element in inputs
 21 |                 (0 for the negative class and 1 for the positive class).
 22 |     """
 23 |     inputs = inputs.sigmoid()
 24 |     inputs = inputs.flatten(1)
 25 |     numerator = 2 * torch.einsum("nc,mc->nm", inputs, targets)
 26 |     denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :]
 27 |     loss = 1 - (numerator + 1) / (denominator + 1)
 28 |     return loss
 29 | 
 30 | 
 31 | batch_dice_loss_jit = torch.jit.script(
 32 |     batch_dice_loss
 33 | )  # type: torch.jit.ScriptModule
 34 | 
 35 | 
 36 | def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor):
 37 |     """
 38 |     Args:
 39 |         inputs: A float tensor of arbitrary shape.
 40 |                 The predictions for each example.
 41 |         targets: A float tensor with the same shape as inputs. Stores the binary
 42 |                  classification label for each element in inputs
 43 |                 (0 for the negative class and 1 for the positive class).
 44 |     Returns:
 45 |         Loss tensor
 46 |     """
 47 |     hw = inputs.shape[1]
 48 | 
 49 |     pos = F.binary_cross_entropy_with_logits(
 50 |         inputs, torch.ones_like(inputs), reduction="none"
 51 |     )
 52 |     neg = F.binary_cross_entropy_with_logits(
 53 |         inputs, torch.zeros_like(inputs), reduction="none"
 54 |     )
 55 | 
 56 |     loss = torch.einsum("nc,mc->nm", pos, targets) + torch.einsum(
 57 |         "nc,mc->nm", neg, (1 - targets)
 58 |     )
 59 | 
 60 |     return loss / hw
 61 | 
 62 | 
 63 | batch_sigmoid_ce_loss_jit = torch.jit.script(
 64 |     batch_sigmoid_ce_loss
 65 | )  # type: torch.jit.ScriptModule
 66 | 
 67 | 
 68 | class LbvqHungarianMatcher(nn.Module):
 69 |     """This class computes an assignment between the targets and the predictions of the network
 70 | 
 71 |     For efficiency reasons, the targets don't include the no_object. Because of this, in general,
 72 |     there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
 73 |     while the others are un-matched (and thus treated as non-objects).
 74 |     """
 75 | 
 76 |     def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_dice: float = 1, num_points: int = 0):
 77 |         """Creates the matcher
 78 | 
 79 |         Params:
 80 |             cost_class: This is the relative weight of the classification error in the matching cost
 81 |             cost_mask: This is the relative weight of the focal loss of the binary mask in the matching cost
 82 |             cost_dice: This is the relative weight of the dice loss of the binary mask in the matching cost
 83 |         """
 84 |         super().__init__()
 85 |         self.cost_class = cost_class
 86 |         self.cost_mask = cost_mask
 87 |         self.cost_dice = cost_dice
 88 | 
 89 |         assert cost_class != 0 or cost_mask != 0 or cost_dice != 0, "all costs cant be 0"
 90 | 
 91 |         self.num_points = num_points
 92 | 
 93 |     @torch.no_grad()
 94 |     def memory_efficient_forward(self, outputs, targets):
 95 |         # We flatten to compute the cost matrices in a batch
 96 | 
 97 |         # Here, "L" is the number of frame-level decoder layers.
 98 |         out_prob = outputs["pred_logits"].softmax(-1)   # L, B, cQ, K+1
 99 |         out_mask = outputs["pred_masks"]                # L, B, cQ, T, H, W
100 | 
101 |         L, B, cQ, T, s_h, s_w = out_mask.shape
102 | 
103 |         out_prob = out_prob.reshape(L*B, cQ, -1)
104 |         out_mask = out_mask.reshape(L*B, cQ, T, s_h, s_w)
105 | 
106 |         # If target is [vid1, vid2, vid3],
107 |         # it now becomes [vid1, vid2, vid3, vid1, vid2, vid3, ...].
108 |         targets = targets * L
109 | 
110 |         indices = []
111 |         for b in range(L*B):
112 |             b_out_prob = out_prob[b]
113 |             tgt_ids = targets[b]["labels"]
114 |             # Compute the classification cost. Contrary to the loss, we don't use the NLL,
115 |             # but approximate it in 1 - proba[target class].
116 |             # The 1 is a constant that doesn't change the matching, it can be ommitted.
117 |             cost_class = -b_out_prob[:, tgt_ids]
118 | 
119 |             b_out_mask = out_mask[b]  # cQ x T x H_pred x W_pred
120 |             # gt masks are already padded when preparing target
121 |             tgt_mask = targets[b]["masks"].to(b_out_mask) # Nins x T x H_tgt x W_tgt
122 | 
123 |             # out_mask = out_mask[:, None]
124 |             # tgt_mask = tgt_mask[:, None]
125 |             # all masks share the same set of points for efficient matching!
126 |             point_coords = torch.rand(1, self.num_points, 2, device=b_out_mask.device)
127 |             # get gt labels
128 |             tgt_mask = point_sample(
129 |                 tgt_mask,
130 |                 point_coords.repeat(tgt_mask.shape[0], 1, 1),
131 |                 align_corners=False,
132 |             ).flatten(1)
133 | 
134 |             b_out_mask = point_sample(
135 |                 b_out_mask,
136 |                 point_coords.repeat(b_out_mask.shape[0], 1, 1),
137 |                 align_corners=False,
138 |             ).flatten(1)
139 | 
140 |             with autocast(enabled=False):
141 |                 b_out_mask = b_out_mask.float()
142 |                 tgt_mask = tgt_mask.float()
143 |                 # Compute the focal loss between masks
144 |                 cost_mask = batch_sigmoid_ce_loss_jit(b_out_mask, tgt_mask)
145 |                 # Compute the dice loss betwen masks
146 |                 cost_dice = batch_dice_loss(b_out_mask, tgt_mask)
147 |             
148 |             # Final cost matrix
149 |             C = (
150 |                 self.cost_mask * cost_mask
151 |                 + self.cost_class * cost_class
152 |                 + self.cost_dice * cost_dice
153 |             )
154 |             C = C.reshape(cQ, -1).cpu()
155 | 
156 |             indices.append(linear_sum_assignment(C))
157 | 
158 |         return [
159 |             (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64))
160 |             for i, j in indices
161 |         ]
162 | 
163 |     @torch.no_grad()
164 |     def forward(self, outputs, targets):
165 |         """Performs the matching
166 | 
167 |         Params:
168 |             outputs: This is a dict that contains at least these entries:
169 |                  "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
170 |                  "pred_masks": Tensor of dim [batch_size, num_queries, H_pred, W_pred] with the predicted masks
171 | 
172 |             targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
173 |                  "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
174 |                            objects in the target) containing the class labels
175 |                  "masks": Tensor of dim [num_target_boxes, H_gt, W_gt] containing the target masks
176 | 
177 |         Returns:
178 |             A list of size batch_size, containing tuples of (index_i, index_j) where:
179 |                 - index_i is the indices of the selected predictions (in order)
180 |                 - index_j is the indices of the corresponding selected targets (in order)
181 |             For each batch element, it holds:
182 |                 len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
183 |         """
184 |         return self.memory_efficient_forward(outputs, targets)
185 | 
186 |     def __repr__(self, _repr_indent=4):
187 |         head = "Matcher " + self.__class__.__name__
188 |         body = [
189 |             "cost_class: {}".format(self.cost_class),
190 |             "cost_mask: {}".format(self.cost_mask),
191 |             "cost_dice: {}".format(self.cost_dice),
192 |         ]
193 |         lines = [head] + [" " * _repr_indent + line for line in body]
194 |         return "\n".join(lines)
195 | 


--------------------------------------------------------------------------------
/lbvq/data/ytvis_eval.py:
--------------------------------------------------------------------------------
  1 | import contextlib
  2 | import copy
  3 | import io
  4 | import itertools
  5 | import json
  6 | import logging
  7 | import numpy as np
  8 | import os
  9 | from collections import OrderedDict
 10 | import pycocotools.mask as mask_util
 11 | import torch
 12 | from .datasets.ytvis_api.ytvos import YTVOS
 13 | 
 14 | import detectron2.utils.comm as comm
 15 | from detectron2.config import CfgNode
 16 | from detectron2.data import MetadataCatalog
 17 | from detectron2.evaluation import DatasetEvaluator
 18 | from detectron2.utils.file_io import PathManager
 19 | 
 20 | 
 21 | class YTVISEvaluator(DatasetEvaluator):
 22 |     """
 23 |     Evaluate AR for object proposals, AP for instance detection/segmentation, AP
 24 |     for keypoint detection outputs using COCO's metrics.
 25 |     See http://cocodataset.org/#detection-eval and
 26 |     http://cocodataset.org/#keypoints-eval to understand its metrics.
 27 | 
 28 |     In addition to COCO, this evaluator is able to support any bounding box detection,
 29 |     instance segmentation, or keypoint detection dataset.
 30 |     """
 31 | 
 32 |     def __init__(
 33 |         self,
 34 |         dataset_name,
 35 |         tasks=None,
 36 |         distributed=True,
 37 |         output_dir=None,
 38 |         *,
 39 |         use_fast_impl=True,
 40 |     ):
 41 |         """
 42 |         Args:
 43 |             dataset_name (str): name of the dataset to be evaluated.
 44 |                 It must have either the following corresponding metadata:
 45 | 
 46 |                     "json_file": the path to the COCO format annotation
 47 | 
 48 |                 Or it must be in detectron2's standard dataset format
 49 |                 so it can be converted to COCO format automatically.
 50 |             tasks (tuple[str]): tasks that can be evaluated under the given
 51 |                 configuration. A task is one of "bbox", "segm", "keypoints".
 52 |                 By default, will infer this automatically from predictions.
 53 |             distributed (True): if True, will collect results from all ranks and run evaluation
 54 |                 in the main process.
 55 |                 Otherwise, will only evaluate the results in the current process.
 56 |             output_dir (str): optional, an output directory to dump all
 57 |                 results predicted on the dataset. The dump contains two files:
 58 | 
 59 |                 1. "instances_predictions.pth" a file in torch serialization
 60 |                    format that contains all the raw original predictions.
 61 |                 2. "coco_instances_results.json" a json file in COCO's result
 62 |                    format.
 63 |             use_fast_impl (bool): use a fast but **unofficial** implementation to compute AP.
 64 |                 Although the results should be very close to the official implementation in COCO
 65 |                 API, it is still recommended to compute results with the official API for use in
 66 |                 papers. The faster implementation also uses more RAM.
 67 |         """
 68 |         self._logger = logging.getLogger(__name__)
 69 |         self._distributed = distributed
 70 |         self._output_dir = output_dir
 71 |         self._use_fast_impl = use_fast_impl
 72 | 
 73 |         if tasks is not None and isinstance(tasks, CfgNode):
 74 |             self._logger.warning(
 75 |                 "COCO Evaluator instantiated using config, this is deprecated behavior."
 76 |                 " Please pass in explicit arguments instead."
 77 |             )
 78 |             self._tasks = None  # Infering it from predictions should be better
 79 |         else:
 80 |             self._tasks = tasks
 81 | 
 82 |         self._cpu_device = torch.device("cpu")
 83 | 
 84 |         self._metadata = MetadataCatalog.get(dataset_name)
 85 | 
 86 |         json_file = PathManager.get_local_path(self._metadata.json_file)
 87 |         with contextlib.redirect_stdout(io.StringIO()):
 88 |             self._ytvis_api = YTVOS(json_file)
 89 | 
 90 |     def reset(self):
 91 |         self._predictions = []
 92 | 
 93 |     def process(self, inputs, outputs):
 94 |         """
 95 |         Args:
 96 |             inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
 97 |                 It is a list of dict. Each dict corresponds to an image and
 98 |                 contains keys like "height", "width", "file_name", "image_id".
 99 |             outputs: the outputs of a COCO model. It is a list of dicts with key
100 |                 "instances" that contains :class:`Instances`.
101 |         """
102 |         prediction = instances_to_coco_json_video(inputs, outputs)
103 |         self._predictions.extend(prediction)
104 | 
105 |     def evaluate(self):
106 |         """
107 |         Args:
108 |             img_ids: a list of image IDs to evaluate on. Default to None for the whole dataset
109 |         """
110 |         if self._distributed:
111 |             comm.synchronize()
112 |             predictions = comm.gather(self._predictions, dst=0)
113 |             predictions = list(itertools.chain(*predictions))
114 | 
115 |             if not comm.is_main_process():
116 |                 return {}
117 |         else:
118 |             predictions = self._predictions
119 | 
120 |         if len(predictions) == 0:
121 |             self._logger.warning("[COCOEvaluator] Did not receive valid predictions.")
122 |             return {}
123 | 
124 |         if self._output_dir:
125 |             PathManager.mkdirs(self._output_dir)
126 |             file_path = os.path.join(self._output_dir, "instances_predictions.pth")
127 |             with PathManager.open(file_path, "wb") as f:
128 |                 torch.save(predictions, f)
129 | 
130 |         self._results = OrderedDict()
131 |         self._eval_predictions(predictions)
132 |         # Copy so the caller can do whatever with results
133 |         return copy.deepcopy(self._results)
134 | 
135 |     def _eval_predictions(self, predictions):
136 |         """
137 |         Evaluate predictions. Fill self._results with the metrics of the tasks.
138 |         """
139 |         self._logger.info("Preparing results for YTVIS format ...")
140 | 
141 |         # unmap the category ids for COCO
142 |         if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
143 |             dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id
144 |             all_contiguous_ids = list(dataset_id_to_contiguous_id.values())
145 |             num_classes = len(all_contiguous_ids)
146 |             assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1
147 | 
148 |             reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()}
149 |             for result in predictions:
150 |                 category_id = result["category_id"]
151 |                 assert category_id < num_classes, (
152 |                     f"A prediction has class={category_id}, "
153 |                     f"but the dataset only has {num_classes} classes and "
154 |                     f"predicted class id should be in [0, {num_classes - 1}]."
155 |                 )
156 |                 result["category_id"] = reverse_id_mapping[category_id]
157 | 
158 |         if self._output_dir:
159 |             file_path = os.path.join(self._output_dir, "results.json")
160 |             self._logger.info("Saving results to {}".format(file_path))
161 |             with PathManager.open(file_path, "w") as f:
162 |                 f.write(json.dumps(predictions))
163 |                 f.flush()
164 | 
165 |         self._logger.info("Annotations are not available for evaluation.")
166 |         return
167 | 
168 | 
def instances_to_coco_json_video(inputs, outputs):
    """
    Convert the predictions of a single video into a list of result dicts.

    Args:
        inputs (list[dict]): must contain exactly one dict; its "video_id"
            is copied into every result.
        outputs (dict): provides "pred_scores", "pred_labels" and
            "pred_masks", iterated in lockstep (one entry per instance).
            Each "pred_masks" entry is iterated frame by frame.

    Returns:
        list[dict]: one dict per instance with the keys "video_id", "score",
        "category_id" and "segmentations" (a list of RLE dicts, one per
        frame, whose "counts" are decoded to str).
    """
    assert len(inputs) == 1, "More than one inputs are loaded for inference!"

    video_id = inputs[0]["video_id"]

    scores = outputs["pred_scores"]
    labels = outputs["pred_labels"]
    masks = outputs["pred_masks"]

    def _encode_frame(frame_mask):
        # Encode a single frame mask as RLE; the bytes "counts" field is
        # decoded to text.
        rle = mask_util.encode(
            np.array(frame_mask[:, :, None], order="F", dtype="uint8")
        )[0]
        rle["counts"] = rle["counts"].decode("utf-8")
        return rle

    return [
        {
            "video_id": video_id,
            "score": score,
            "category_id": label,
            "segmentations": [_encode_frame(frame_mask) for frame_mask in track],
        }
        for score, label, track in zip(scores, labels, masks)
    ]
206 | 


--------------------------------------------------------------------------------
/demo_lbvq/predictor.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
  2 | # reference: https://github.com/sukjunhwang/IFC/blob/master/projects/IFC/demo/predictor.py
  3 | import atexit
  4 | import bisect
  5 | import multiprocessing as mp
  6 | from collections import deque
  7 | import cv2
  8 | import torch
  9 | 
 10 | from visualizer import TrackVisualizer
 11 | 
 12 | from detectron2.data import MetadataCatalog
 13 | from detectron2.engine.defaults import DefaultPredictor
 14 | from detectron2.structures import Instances
 15 | from detectron2.utils.video_visualizer import VideoVisualizer
 16 | from detectron2.utils.visualizer import ColorMode
 17 | 
 18 | 
 19 | class VisualizationDemo(object):
 20 |     def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False, conf_thres=0.5):
 21 |         """
 22 |         Args:
 23 |             cfg (CfgNode):
 24 |             instance_mode (ColorMode):
 25 |             parallel (bool): whether to run the model in different processes from visualization.
 26 |                 Useful since the visualization logic can be slow.
 27 |         """
 28 |         self.metadata = MetadataCatalog.get(
 29 |             cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
 30 |         )
 31 |         self.cpu_device = torch.device("cpu")
 32 |         self.instance_mode = instance_mode
 33 | 
 34 |         self.parallel = parallel
 35 |         if parallel:
 36 |             num_gpu = torch.cuda.device_count()
 37 |             self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu)
 38 |         else:
 39 |             self.predictor = VideoPredictor(cfg)
 40 |         self.conf_thres = conf_thres
 41 | 
 42 |     def run_on_video(self, frames):
 43 |         """
 44 |         Args:
 45 |             frames (List[np.ndarray]): a list of images of shape (H, W, C) (in BGR order).
 46 |                 This is the format used by OpenCV.
 47 |         Returns:
 48 |             predictions (dict): the output of the model.
 49 |             vis_output (VisImage): the visualized image output.
 50 |         """
 51 |         vis_output = None
 52 |         predictions = self.predictor(frames)
 53 | 
 54 |         image_size = predictions["image_size"]
 55 |         _pred_scores = predictions["pred_scores"]
 56 |         _pred_labels = predictions["pred_labels"]
 57 |         _pred_masks = predictions["pred_masks"]
 58 | 
 59 |         pred_scores, pred_labels, pred_masks = [], [], []
 60 |         for s, l, m in zip(_pred_scores, _pred_labels, _pred_masks):
 61 |             if s > self.conf_thres:
 62 |                 pred_scores.append(s)
 63 |                 pred_labels.append(l)
 64 |                 pred_masks.append(m)
 65 | 
 66 |         frame_masks = list(zip(*pred_masks))
 67 |         total_vis_output = []
 68 |         for frame_idx in range(len(frames)):
 69 |             frame = frames[frame_idx][:, :, ::-1]
 70 |             visualizer = TrackVisualizer(frame, self.metadata, instance_mode=self.instance_mode)
 71 |             ins = Instances(image_size)
 72 |             if len(pred_scores) > 0:
 73 |                 ins.scores = pred_scores
 74 |                 ins.pred_classes = pred_labels
 75 |                 ins.pred_masks = torch.stack(frame_masks[frame_idx], dim=0)
 76 | 
 77 |             vis_output = visualizer.draw_instance_predictions(predictions=ins)
 78 |             total_vis_output.append(vis_output)
 79 | 
 80 |         return predictions, total_vis_output
 81 | 
 82 | 
 83 | class VideoPredictor(DefaultPredictor):
 84 |     """
 85 |     Create a simple end-to-end predictor with the given config that runs on
 86 |     single device for a single input image.
 87 |     Compared to using the model directly, this class does the following additions:
 88 |     1. Load checkpoint from `cfg.MODEL.WEIGHTS`.
 89 |     2. Always take BGR image as the input and apply conversion defined by `cfg.INPUT.FORMAT`.
 90 |     3. Apply resizing defined by `cfg.INPUT.{MIN,MAX}_SIZE_TEST`.
 91 |     4. Take one input image and produce a single output, instead of a batch.
 92 |     If you'd like to do anything more fancy, please refer to its source code
 93 |     as examples to build and use the model manually.
 94 |     Attributes:
 95 |         metadata (Metadata): the metadata of the underlying dataset, obtained from
 96 |             cfg.DATASETS.TEST.
 97 |     Examples:
 98 |     ::
 99 |         pred = DefaultPredictor(cfg)
100 |         inputs = cv2.imread("input.jpg")
101 |         outputs = pred(inputs)
102 |     """
103 |     def __call__(self, frames):
104 |         """
105 |         Args:
106 |             original_image (np.ndarray): an image of shape (H, W, C) (in BGR order).
107 |         Returns:
108 |             predictions (dict):
109 |                 the output of the model for one image only.
110 |                 See :doc:`/tutorials/models` for details about the format.
111 |         """
112 |         with torch.no_grad():  # https://github.com/sphinx-doc/sphinx/issues/4258
113 |             input_frames = []
114 |             for original_image in frames:
115 |                 # Apply pre-processing to image.
116 |                 if self.input_format == "RGB":
117 |                     # whether the model expects BGR inputs or RGB
118 |                     original_image = original_image[:, :, ::-1]
119 |                 height, width = original_image.shape[:2]
120 |                 image = self.aug.get_transform(original_image).apply_image(original_image)
121 |                 image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))
122 |                 input_frames.append(image)
123 | 
124 |             inputs = {"image": input_frames, "height": height, "width": width}
125 |             predictions = self.model([inputs])
126 |             return predictions
127 | 
128 | 
class AsyncPredictor:
    """
    Runs the model in background worker processes, possibly on >1 GPUs.

    Tasks are numbered when submitted and results are handed back in
    submission order, even when workers finish out of order. Because
    rendering the visualization takes a considerable amount of time, this
    helps improve throughput when rendering videos.
    """

    class _StopToken:
        """Marker object that tells a worker to leave its loop."""

    class _PredictWorker(mp.Process):
        """Process that serves tasks from a queue with its own `VideoPredictor`."""

        def __init__(self, cfg, task_queue, result_queue):
            self.cfg = cfg
            self.task_queue = task_queue
            self.result_queue = result_queue
            super().__init__()

        def run(self):
            # The predictor is created inside run(), i.e. in the worker process.
            predictor = VideoPredictor(self.cfg)

            task = self.task_queue.get()
            while not isinstance(task, AsyncPredictor._StopToken):
                idx, data = task
                self.result_queue.put((idx, predictor(data)))
                task = self.task_queue.get()

    def __init__(self, cfg, num_gpus: int = 1):
        """
        Args:
            cfg (CfgNode):
            num_gpus (int): if 0, will run on CPU
        """
        num_workers = max(num_gpus, 1)
        queue_size = num_workers * 3
        self.task_queue = mp.Queue(maxsize=queue_size)
        self.result_queue = mp.Queue(maxsize=queue_size)

        self.procs = []
        for gpuid in range(num_workers):
            # Each worker gets its own unfrozen config pinned to one device.
            worker_cfg = cfg.clone()
            worker_cfg.defrost()
            worker_cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu"
            worker = AsyncPredictor._PredictWorker(
                worker_cfg, self.task_queue, self.result_queue
            )
            self.procs.append(worker)

        # Counters of submitted / retrieved tasks, plus a sorted buffer for
        # results that arrived ahead of their turn.
        self.put_idx = 0
        self.get_idx = 0
        self.result_rank = []
        self.result_data = []

        for worker in self.procs:
            worker.start()
        atexit.register(self.shutdown)

    def put(self, image):
        """Submit `image` as the next numbered task."""
        self.put_idx += 1
        self.task_queue.put((self.put_idx, image))

    def get(self):
        """Return the result of the oldest task not yet retrieved."""
        self.get_idx += 1  # the index needed for this request
        wanted = self.get_idx

        # The wanted result may already sit at the head of the buffer.
        if self.result_rank and self.result_rank[0] == wanted:
            self.result_rank.pop(0)
            return self.result_data.pop(0)

        while True:
            # Results that arrive early are kept sorted by index until needed.
            idx, res = self.result_queue.get()
            if idx == wanted:
                return res
            slot = bisect.bisect(self.result_rank, idx)
            self.result_rank.insert(slot, idx)
            self.result_data.insert(slot, res)

    def __len__(self):
        """Number of submitted tasks whose result has not been retrieved yet."""
        return self.put_idx - self.get_idx

    def __call__(self, image):
        """Submit `image` and wait for its result."""
        self.put(image)
        return self.get()

    def shutdown(self):
        """Send one stop token per worker process."""
        for _ in self.procs:
            self.task_queue.put(AsyncPredictor._StopToken())

    @property
    def default_buffer_size(self):
        """Five times the number of worker processes."""
        return len(self.procs) * 5
218 | 


--------------------------------------------------------------------------------