├── .gitignore
├── maskclip
    ├── evaluation
    │   ├── __init__.py
    │   └── instance_evaluation.py
    ├── utils
    │   ├── __init__.py
    │   ├── get_vocab.py
    │   └── misc.py
    ├── modeling
    │   ├── backbone
    │   │   └── __init__.py
    │   ├── meta_arch
    │   │   ├── __init__.py
    │   │   ├── mask_former_head.py
    │   │   └── per_pixel_baseline.py
    │   ├── pixel_decoder
    │   │   ├── __init__.py
    │   │   └── ops
    │   │   │   ├── MultiScaleDeformableAttention.egg-info
    │   │   │       ├── dependency_links.txt
    │   │   │       ├── top_level.txt
    │   │   │       ├── PKG-INFO
    │   │   │       └── SOURCES.txt
    │   │   │   ├── build
    │   │   │       ├── temp.linux-x86_64-cpython-39
    │   │   │       │   ├── .ninja_deps
    │   │   │       │   ├── mnt
    │   │   │       │   │   └── sdd
    │   │   │       │   │   │   └── OVS_experiments
    │   │   │       │   │   │       └── maskclip
    │   │   │       │   │   │           └── train_bkb
    │   │   │       │   │   │               └── mask2former
    │   │   │       │   │   │                   └── modeling
    │   │   │       │   │   │                       └── pixel_decoder
    │   │   │       │   │   │                           └── ops
    │   │   │       │   │   │                               └── src
    │   │   │       │   │   │                                   ├── vision.o
    │   │   │       │   │   │                                   ├── cpu
    │   │   │       │   │   │                                       └── ms_deform_attn_cpu.o
    │   │   │       │   │   │                                   └── cuda
    │   │   │       │   │   │                                       └── ms_deform_attn_cuda.o
    │   │   │       │   ├── .ninja_log
    │   │   │       │   └── build.ninja
    │   │   │       ├── lib.linux-x86_64-3.9
    │   │   │       │   ├── MultiScaleDeformableAttention.cpython-39-x86_64-linux-gnu.so
    │   │   │       │   ├── modules
    │   │   │       │   │   ├── __init__.py
    │   │   │       │   │   └── ms_deform_attn.py
    │   │   │       │   └── functions
    │   │   │       │   │   ├── __init__.py
    │   │   │       │   │   └── ms_deform_attn_func.py
    │   │   │       ├── lib.linux-x86_64-cpython-39
    │   │   │       │   ├── MultiScaleDeformableAttention.cpython-39-x86_64-linux-gnu.so
    │   │   │       │   ├── modules
    │   │   │       │   │   ├── __init__.py
    │   │   │       │   │   └── ms_deform_attn.py
    │   │   │       │   └── functions
    │   │   │       │   │   ├── __init__.py
    │   │   │       │   │   └── ms_deform_attn_func.py
    │   │   │       └── temp.linux-x86_64-3.9
    │   │   │       │   └── mnt
    │   │   │       │       └── local
    │   │   │       │           └── zhding
    │   │   │       │               └── open_source
    │   │   │       │                   └── mask2former
    │   │   │       │                       └── modeling
    │   │   │       │                           └── pixel_decoder
    │   │   │       │                               └── ops
    │   │   │       │                                   └── src
    │   │   │       │                                       ├── vision.o
    │   │   │       │                                       ├── cpu
    │   │   │       │                                           └── ms_deform_attn_cpu.o
    │   │   │       │                                       └── cuda
    │   │   │       │                                           └── ms_deform_attn_cuda.o
    │   │   │   ├── dist
    │   │   │       └── MultiScaleDeformableAttention-1.0-py3.9-linux-x86_64.egg
    │   │   │   ├── make.sh
    │   │   │   ├── modules
    │   │   │       ├── __init__.py
    │   │   │       └── ms_deform_attn.py
    │   │   │   ├── functions
    │   │   │       ├── __init__.py
    │   │   │       └── ms_deform_attn_func.py
    │   │   │   ├── src
    │   │   │       ├── vision.cpp
    │   │   │       ├── cuda
    │   │   │       │   ├── ms_deform_attn_cuda.h
    │   │   │       │   └── ms_deform_attn_cuda.cu
    │   │   │       ├── cpu
    │   │   │       │   ├── ms_deform_attn_cpu.h
    │   │   │       │   └── ms_deform_attn_cpu.cpp
    │   │   │       └── ms_deform_attn.h
    │   │   │   ├── setup.py
    │   │   │   └── test.py
    │   ├── transformer_decoder
    │   │   ├── __init__.py
    │   │   ├── position_encoding.py
    │   │   └── maskformer_transformer_decoder.py
    │   ├── __init__.py
    │   ├── matcher.py
    │   ├── utils.py
    │   └── maskclip.py
    ├── data
    │   ├── dataset_mappers
    │   │   ├── __init__.py
    │   │   ├── coco_panoptic_new_baseline_dataset_mapper.py
    │   │   ├── mask_former_panoptic_dataset_mapper.py
    │   │   ├── mask_former_instance_dataset_mapper.py
    │   │   ├── mask_former_semantic_dataset_mapper.py
    │   │   └── coco_instance_new_baseline_dataset_mapper.py
    │   ├── __init__.py
    │   └── datasets
    │   │   ├── __init__.py
    │   │   ├── register_ade20k_instance.py
    │   │   └── register_coco_panoptic_annos_semseg.py
    ├── __init__.py
    ├── test_time_augmentation.py
    └── config.py
├── figs
    └── model.png
├── datasets
    ├── prepare_ade20k_sem_seg.py
    ├── ade20k_instance_catid_mapping.txt
    ├── prepare_coco_semantic_annos_from_panoptic_annos.py
    ├── prepare_ade20k_ins_seg.py
    └── README.md
├── configs
    ├── coco
    │   ├── Base-COCO-PanopticSegmentation.yaml
    │   └── maskformer2_R50_bs16_50ep.yaml
    └── ade20k
    │   ├── maskformer2_R50_bs16_160k.yaml
    │   └── Base-ADE20K-PanopticSegmentation.yaml
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | 


--------------------------------------------------------------------------------
/maskclip/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/figs/model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlpc-ucsd/MaskCLIP/HEAD/figs/model.png


--------------------------------------------------------------------------------
/maskclip/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | 


--------------------------------------------------------------------------------
/maskclip/modeling/backbone/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | 


--------------------------------------------------------------------------------
/maskclip/data/dataset_mappers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | 


--------------------------------------------------------------------------------
/maskclip/modeling/meta_arch/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/MultiScaleDeformableAttention.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/maskclip/data/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from . import datasets
3 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/MultiScaleDeformableAttention.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | MultiScaleDeformableAttention
2 | functions
3 | modules
4 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/build/temp.linux-x86_64-cpython-39/.ninja_deps:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlpc-ucsd/MaskCLIP/HEAD/maskclip/modeling/pixel_decoder/ops/build/temp.linux-x86_64-cpython-39/.ninja_deps


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/dist/MultiScaleDeformableAttention-1.0-py3.9-linux-x86_64.egg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlpc-ucsd/MaskCLIP/HEAD/maskclip/modeling/pixel_decoder/ops/dist/MultiScaleDeformableAttention-1.0-py3.9-linux-x86_64.egg


--------------------------------------------------------------------------------
/maskclip/modeling/transformer_decoder/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from .maskformer_transformer_decoder import StandardTransformerDecoder
3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder
4 | 


--------------------------------------------------------------------------------
/maskclip/data/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from . import (
3 |     register_ade20k_full,
4 |     register_ade20k_panoptic,
5 |     register_coco_stuff_10k,
6 |     register_coco_panoptic_annos_semseg,
7 |     register_ade20k_instance,
8 | )
9 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/build/lib.linux-x86_64-3.9/MultiScaleDeformableAttention.cpython-39-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlpc-ucsd/MaskCLIP/HEAD/maskclip/modeling/pixel_decoder/ops/build/lib.linux-x86_64-3.9/MultiScaleDeformableAttention.cpython-39-x86_64-linux-gnu.so


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/build/lib.linux-x86_64-cpython-39/MultiScaleDeformableAttention.cpython-39-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlpc-ucsd/MaskCLIP/HEAD/maskclip/modeling/pixel_decoder/ops/build/lib.linux-x86_64-cpython-39/MultiScaleDeformableAttention.cpython-39-x86_64-linux-gnu.so


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/build/temp.linux-x86_64-3.9/mnt/local/zhding/open_source/mask2former/modeling/pixel_decoder/ops/src/vision.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlpc-ucsd/MaskCLIP/HEAD/maskclip/modeling/pixel_decoder/ops/build/temp.linux-x86_64-3.9/mnt/local/zhding/open_source/mask2former/modeling/pixel_decoder/ops/src/vision.o


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/build/temp.linux-x86_64-3.9/mnt/local/zhding/open_source/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlpc-ucsd/MaskCLIP/HEAD/maskclip/modeling/pixel_decoder/ops/build/temp.linux-x86_64-3.9/mnt/local/zhding/open_source/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.o


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/build/temp.linux-x86_64-3.9/mnt/local/zhding/open_source/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlpc-ucsd/MaskCLIP/HEAD/maskclip/modeling/pixel_decoder/ops/build/temp.linux-x86_64-3.9/mnt/local/zhding/open_source/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.o


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/build/temp.linux-x86_64-cpython-39/mnt/sdd/OVS_experiments/maskclip/train_bkb/mask2former/modeling/pixel_decoder/ops/src/vision.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlpc-ucsd/MaskCLIP/HEAD/maskclip/modeling/pixel_decoder/ops/build/temp.linux-x86_64-cpython-39/mnt/sdd/OVS_experiments/maskclip/train_bkb/mask2former/modeling/pixel_decoder/ops/src/vision.o


--------------------------------------------------------------------------------
/maskclip/modeling/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from .backbone.swin import D2SwinTransformer
3 | from .pixel_decoder.fpn import BasePixelDecoder
4 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder
5 | from .meta_arch.mask_former_head import MaskFormerHead
6 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead
7 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/MultiScaleDeformableAttention.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
 1 | Metadata-Version: 2.1
 2 | Name: MultiScaleDeformableAttention
 3 | Version: 1.0
 4 | Summary: PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention
 5 | Home-page: https://github.com/fundamentalvision/Deformable-DETR
 6 | Author: Weijie Su
 7 | License: UNKNOWN
 8 | Platform: UNKNOWN
 9 | 
10 | UNKNOWN
11 | 
12 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/build/temp.linux-x86_64-cpython-39/mnt/sdd/OVS_experiments/maskclip/train_bkb/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlpc-ucsd/MaskCLIP/HEAD/maskclip/modeling/pixel_decoder/ops/build/temp.linux-x86_64-cpython-39/mnt/sdd/OVS_experiments/maskclip/train_bkb/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.o


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/build/temp.linux-x86_64-cpython-39/mnt/sdd/OVS_experiments/maskclip/train_bkb/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlpc-ucsd/MaskCLIP/HEAD/maskclip/modeling/pixel_decoder/ops/build/temp.linux-x86_64-cpython-39/mnt/sdd/OVS_experiments/maskclip/train_bkb/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.o


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/MultiScaleDeformableAttention.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
 1 | setup.py
 2 | /mnt/local/zhding/open_source/mask2former/modeling/pixel_decoder/ops/src/vision.cpp
 3 | /mnt/local/zhding/open_source/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp
 4 | /mnt/local/zhding/open_source/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu
 5 | MultiScaleDeformableAttention.egg-info/PKG-INFO
 6 | MultiScaleDeformableAttention.egg-info/SOURCES.txt
 7 | MultiScaleDeformableAttention.egg-info/dependency_links.txt
 8 | MultiScaleDeformableAttention.egg-info/top_level.txt
 9 | functions/__init__.py
10 | functions/ms_deform_attn_func.py
11 | modules/__init__.py
12 | modules/ms_deform_attn.py


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/make.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # ------------------------------------------------------------------------------------------------
 3 | # Deformable DETR
 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | # ------------------------------------------------------------------------------------------------
 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | # ------------------------------------------------------------------------------------------------
 9 | 
10 | # Copyright (c) Facebook, Inc. and its affiliates.
11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
12 | 
13 | python setup.py build install  # build the extension described by setup.py in this directory and install it into the active Python environment
14 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/modules/__init__.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | from .ms_deform_attn import MSDeformAttn
13 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/functions/__init__.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | from .ms_deform_attn_func import MSDeformAttnFunction
13 | 
14 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/build/lib.linux-x86_64-3.9/modules/__init__.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | from .ms_deform_attn import MSDeformAttn
13 | 


--------------------------------------------------------------------------------
/maskclip/utils/get_vocab.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
 3 | from ..data.datasets.register_ade20k_panoptic import ADE20K_150_CATEGORIES
 4 | 
 5 | 
 6 | def get_class_names(dataset_name: str):
 7 |     # COCO panoptic
 8 |     if dataset_name == "coco_2017_train_panoptic" or \
 9 |         dataset_name == "coco_2017_val_panoptic_with_sem_seg":
10 |         class_names = [x['name'] for x in COCO_CATEGORIES]
11 |     # ADE 150
12 |     elif dataset_name == "ade20k_panoptic_val" or \
13 |         dataset_name == "ade20k_panoptic_train":
14 |         class_names = [x['name'] for x in ADE20K_150_CATEGORIES]
15 |     else:
16 |         raise NotImplementedError(f"Unknown dataset: {dataset_name}")
17 | 
18 |     if 'train' in dataset_name:
19 |         class_names.append('other')
20 |     return class_names
21 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/build/lib.linux-x86_64-cpython-39/modules/__init__.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | from .ms_deform_attn import MSDeformAttn
13 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/build/lib.linux-x86_64-3.9/functions/__init__.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | from .ms_deform_attn_func import MSDeformAttnFunction
13 | 
14 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/build/lib.linux-x86_64-cpython-39/functions/__init__.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | from .ms_deform_attn_func import MSDeformAttnFunction
13 | 
14 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/build/temp.linux-x86_64-cpython-39/.ninja_log:
--------------------------------------------------------------------------------
1 | # ninja log v5
2 | 0	4002	1667684389800053600	/mnt/sdd/OVS_experiments/maskclip/train_bkb/mask2former/modeling/pixel_decoder/ops/build/temp.linux-x86_64-cpython-39/mnt/sdd/OVS_experiments/maskclip/train_bkb/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.o	ac6e50dfe84bd228
3 | 1	15343	1667684401133586400	/mnt/sdd/OVS_experiments/maskclip/train_bkb/mask2former/modeling/pixel_decoder/ops/build/temp.linux-x86_64-cpython-39/mnt/sdd/OVS_experiments/maskclip/train_bkb/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.o	9e82a791e0d9cf33
4 | 1	17080	1667684402877534800	/mnt/sdd/OVS_experiments/maskclip/train_bkb/mask2former/modeling/pixel_decoder/ops/build/temp.linux-x86_64-cpython-39/mnt/sdd/OVS_experiments/maskclip/train_bkb/mask2former/modeling/pixel_decoder/ops/src/vision.o	e98aff91c8aeab85
5 | 


--------------------------------------------------------------------------------
/datasets/prepare_ade20k_sem_seg.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | # Copyright (c) Facebook, Inc. and its affiliates.
 4 | import os
 5 | from pathlib import Path
 6 | 
 7 | import numpy as np
 8 | import tqdm
 9 | from PIL import Image
10 | 
11 | 
12 | def convert(input, output):
13 |     img = np.asarray(Image.open(input))
14 |     assert img.dtype == np.uint8
15 |     img = img - 1  # 0 (ignore) becomes 255. others are shifted by 1
16 |     Image.fromarray(img).save(output)
17 | 
18 | 
19 | if __name__ == "__main__":
20 |     dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016"
21 |     for name in ["training", "validation"]:
22 |         annotation_dir = dataset_dir / "annotations" / name
23 |         output_dir = dataset_dir / "annotations_detectron2" / name
24 |         output_dir.mkdir(parents=True, exist_ok=True)
25 |         for file in tqdm.tqdm(list(annotation_dir.iterdir())):
26 |             output_file = output_dir / file.name
27 |             convert(file, output_file)
28 | 


--------------------------------------------------------------------------------
/maskclip/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Facebook, Inc. and its affiliates.
 2 | from . import data  # register all new datasets
 3 | from . import modeling
 4 | 
 5 | # config
 6 | from .config import add_maskformer2_config
 7 | 
 8 | # dataset loading
 9 | from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper
10 | from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper
11 | from .data.dataset_mappers.mask_former_instance_dataset_mapper import (
12 |     MaskFormerInstanceDatasetMapper,
13 | )
14 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import (
15 |     MaskFormerPanopticDatasetMapper,
16 | )
17 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import (
18 |     MaskFormerSemanticDatasetMapper,
19 | )
20 | 
21 | # models
22 | from .model import MaskFormer
23 | from .test_time_augmentation import SemanticSegmentorWithTTA
24 | 
25 | # evaluation
26 | from .evaluation.instance_evaluation import InstanceSegEvaluator
27 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/src/vision.cpp:
--------------------------------------------------------------------------------
 1 | /*!
 2 | **************************************************************************************************
 3 | * Deformable DETR
 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | **************************************************************************************************
 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | **************************************************************************************************
 9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #include "ms_deform_attn.h"
17 | 
18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {  // Python bindings for this extension; the module name is supplied by the TORCH_EXTENSION_NAME macro
19 |   m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");  // exposed to Python under the same name; the last argument is the docstring (presumably declared in ms_deform_attn.h - confirm)
20 |   m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");  // backward counterpart, bound the same way
21 | }
22 | 


--------------------------------------------------------------------------------
/configs/coco/Base-COCO-PanopticSegmentation.yaml:
--------------------------------------------------------------------------------
 1 | MODEL:
 2 |   BACKBONE:
 3 |     FREEZE_AT: 0
 4 |     NAME: "build_resnet_backbone"
 5 |   WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
 6 |   PIXEL_MEAN: [123.675, 116.280, 103.530]
 7 |   PIXEL_STD: [58.395, 57.120, 57.375]
 8 |   RESNETS:
 9 |     DEPTH: 50
10 |     STEM_TYPE: "basic"  # not used
11 |     STEM_OUT_CHANNELS: 64
12 |     STRIDE_IN_1X1: False
13 |     OUT_FEATURES: ["res2", "res3", "res4", "res5"]
14 |     # NORM: "SyncBN"
15 |     RES5_MULTI_GRID: [1, 1, 1]  # not used
16 | DATASETS:
17 |   TRAIN: ("coco_2017_train_panoptic",)
18 |   TEST: ("coco_2017_val_panoptic_with_sem_seg",)  # to evaluate instance and semantic performance as well
19 | SOLVER:
20 |   IMS_PER_BATCH: 8
21 |   BASE_LR: 0.0001
22 |   # STEPS: (327778, 355092)
23 |   MAX_ITER: 10000
24 |   WARMUP_FACTOR: 1.0
25 |   WARMUP_ITERS: 10
26 |   WEIGHT_DECAY: 0.05
27 |   OPTIMIZER: "ADAMW"
28 |   BACKBONE_MULTIPLIER: 0.1
29 |   CHECKPOINT_PERIOD: 1000
30 |   CLIP_GRADIENTS:
31 |     ENABLED: True
32 |     CLIP_TYPE: "full_model"
33 |     CLIP_VALUE: 0.01
34 |     NORM_TYPE: 2.0
35 |   AMP:
36 |     ENABLED: True
37 | INPUT:
38 |   IMAGE_SIZE: 1024
39 |   MIN_SCALE: 0.1
40 |   MAX_SCALE: 2.0
41 |   FORMAT: "RGB"
42 |   DATASET_MAPPER_NAME: "coco_panoptic_lsj"
43 | TEST:
44 |   EVAL_PERIOD: 368751
45 | DATALOADER:
46 |   FILTER_EMPTY_ANNOTATIONS: True
47 |   NUM_WORKERS: 8
48 | VERSION: 2
49 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h:
--------------------------------------------------------------------------------
 1 | /*!
 2 | **************************************************************************************************
 3 | * Deformable DETR
 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | **************************************************************************************************
 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | **************************************************************************************************
 9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #pragma once
17 | #include <torch/extension.h>
18 | 
19 | at::Tensor ms_deform_attn_cuda_forward(
20 |     const at::Tensor &value, 
21 |     const at::Tensor &spatial_shapes,
22 |     const at::Tensor &level_start_index,
23 |     const at::Tensor &sampling_loc,
24 |     const at::Tensor &attn_weight,
25 |     const int im2col_step);
26 | 
27 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
28 |     const at::Tensor &value, 
29 |     const at::Tensor &spatial_shapes,
30 |     const at::Tensor &level_start_index,
31 |     const at::Tensor &sampling_loc,
32 |     const at::Tensor &attn_weight,
33 |     const at::Tensor &grad_output,
34 |     const int im2col_step);
35 | 
36 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h:
--------------------------------------------------------------------------------
 1 | /*!
 2 | **************************************************************************************************
 3 | * Deformable DETR
 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | **************************************************************************************************
 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | **************************************************************************************************
 9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #pragma once
17 | #include <torch/extension.h>
18 | 
19 | at::Tensor
20 | ms_deform_attn_cpu_forward(
21 |     const at::Tensor &value, 
22 |     const at::Tensor &spatial_shapes,
23 |     const at::Tensor &level_start_index,
24 |     const at::Tensor &sampling_loc,
25 |     const at::Tensor &attn_weight,
26 |     const int im2col_step);
27 | 
28 | std::vector<at::Tensor>
29 | ms_deform_attn_cpu_backward(
30 |     const at::Tensor &value, 
31 |     const at::Tensor &spatial_shapes,
32 |     const at::Tensor &level_start_index,
33 |     const at::Tensor &sampling_loc,
34 |     const at::Tensor &attn_weight,
35 |     const at::Tensor &grad_output,
36 |     const int im2col_step);
37 | 
38 | 
39 | 


--------------------------------------------------------------------------------
/configs/coco/maskformer2_R50_bs16_50ep.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: Base-COCO-PanopticSegmentation.yaml
 2 | MODEL:
 3 |   WEIGHTS: "./mask_proposal_network.pth"
 4 |   META_ARCHITECTURE: "MaskFormer"
 5 |   SEM_SEG_HEAD:
 6 |     NAME: "MaskFormerHead"
 7 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
 8 |     IGNORE_VALUE: 255
 9 |     NUM_CLASSES: 133
10 |     LOSS_WEIGHT: 1.0
11 |     CONVS_DIM: 256
12 |     MASK_DIM: 256
13 |     NORM: "GN"
14 |     # pixel decoder
15 |     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
16 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
17 |     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
18 |     COMMON_STRIDE: 4
19 |     TRANSFORMER_ENC_LAYERS: 6
20 |   MASK_FORMER:
21 |     TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
22 |     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
23 |     DEEP_SUPERVISION: True
24 |     NO_OBJECT_WEIGHT: 0.0 
25 |     CLASS_WEIGHT: 2.0
26 |     MASK_WEIGHT: 5.0
27 |     DICE_WEIGHT: 5.0
28 |     HIDDEN_DIM: 256
29 |     NUM_OBJECT_QUERIES: 100
30 |     NHEADS: 8
31 |     DROPOUT: 0.0
32 |     DIM_FEEDFORWARD: 2048
33 |     ENC_LAYERS: 0
34 |     PRE_NORM: False
35 |     ENFORCE_INPUT_PROJ: False
36 |     SIZE_DIVISIBILITY: 32
37 |     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
38 |     TRAIN_NUM_POINTS: 12544
39 |     OVERSAMPLE_RATIO: 3.0
40 |     IMPORTANCE_SAMPLE_RATIO: 0.75
41 |     TEST:
42 |       SEMANTIC_ON: True
43 |       INSTANCE_ON: True
44 |       PANOPTIC_ON: True
45 |       OVERLAP_THRESHOLD: 0.5
46 |       OBJECT_MASK_THRESHOLD: 0.5
47 | OUTPUT_DIR: ./output
48 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp:
--------------------------------------------------------------------------------
 1 | /*!
 2 | **************************************************************************************************
 3 | * Deformable DETR
 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | **************************************************************************************************
 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | **************************************************************************************************
 9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #include <vector>
17 | 
18 | #include <ATen/ATen.h>
19 | #include <ATen/cuda/CUDAContext.h>
20 | 
21 | 
// CPU entry point for the multi-scale deformable attention forward pass.
// This is a stub: none of the arguments are read and the call always fails
// through AT_ERROR, so it never returns a tensor.
at::Tensor
ms_deform_attn_cpu_forward(
    const at::Tensor &value, 
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const int im2col_step)
{
    AT_ERROR("Not implement on cpu");
}
33 | 
// CPU entry point for the multi-scale deformable attention backward pass.
// This is a stub: none of the arguments are read and the call always fails
// through AT_ERROR, so it never returns any gradients.
std::vector<at::Tensor>
ms_deform_attn_cpu_backward(
    const at::Tensor &value, 
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const at::Tensor &grad_output,
    const int im2col_step)
{
    AT_ERROR("Not implement on cpu");
}
46 | 
47 | 


--------------------------------------------------------------------------------
/configs/ade20k/maskformer2_R50_bs16_160k.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: Base-ADE20K-PanopticSegmentation.yaml
 2 | MODEL:
 3 |   WEIGHTS: "model_final.pth" 
 4 |   META_ARCHITECTURE: "MaskFormer"
 5 |   SEM_SEG_HEAD:
 6 |     NAME: "MaskFormerHead"
 7 |     IGNORE_VALUE: 255
 8 |     NUM_CLASSES: 150
 9 |     LOSS_WEIGHT: 1.0
10 |     CONVS_DIM: 256
11 |     MASK_DIM: 256
12 |     NORM: "GN"
13 |     # pixel decoder
14 |     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
15 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
16 |     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
17 |     COMMON_STRIDE: 4
18 |     TRANSFORMER_ENC_LAYERS: 6
19 |   MASK_FORMER:
20 |     TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
21 |     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
22 |     DEEP_SUPERVISION: True
23 |     NO_OBJECT_WEIGHT: 0.1
24 |     CLASS_WEIGHT: 2.0
25 |     MASK_WEIGHT: 5.0
26 |     DICE_WEIGHT: 5.0
27 |     HIDDEN_DIM: 256
28 |     NUM_OBJECT_QUERIES: 100
29 |     NHEADS: 8
30 |     DROPOUT: 0.0
31 |     DIM_FEEDFORWARD: 2048
32 |     ENC_LAYERS: 0
33 |     PRE_NORM: False
34 |     ENFORCE_INPUT_PROJ: False
35 |     SIZE_DIVISIBILITY: 32
36 |     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
37 |     TRAIN_NUM_POINTS: 12544
38 |     OVERSAMPLE_RATIO: 3.0
39 |     IMPORTANCE_SAMPLE_RATIO: 0.75
40 |     TEST:
41 |       SEMANTIC_ON: True
42 |       INSTANCE_ON: True
43 |       PANOPTIC_ON: True
44 |       OVERLAP_THRESHOLD: 0.5
45 |       OBJECT_MASK_THRESHOLD: 0.3
46 |   CLIP_MODEL:
47 |     NAME: "ViT-L/14@336px"
48 |     INPUT_RESOLUTION: 336
49 |     PATCH_SIZE: 14
50 |     WIDTH: 1024
51 |     LAYERS: 24
52 |     HEADS: 16
53 |     OUTPUT_DIM: 768
54 |     TEMPERATURE: 0.01
55 | 
56 | 


--------------------------------------------------------------------------------
/configs/ade20k/Base-ADE20K-PanopticSegmentation.yaml:
--------------------------------------------------------------------------------
 1 | MODEL:
 2 |   BACKBONE:
 3 |     FREEZE_AT: 0
 4 |     NAME: "build_resnet_backbone"
 5 |   WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
 6 |   PIXEL_MEAN: [123.675, 116.280, 103.530]
 7 |   PIXEL_STD: [58.395, 57.120, 57.375]
 8 |   RESNETS:
 9 |     DEPTH: 50
10 |     STEM_TYPE: "basic"  # not used
11 |     STEM_OUT_CHANNELS: 64
12 |     STRIDE_IN_1X1: False
13 |     OUT_FEATURES: ["res2", "res3", "res4", "res5"]
14 |     # NORM: "SyncBN"
15 |     RES5_MULTI_GRID: [1, 1, 1]  # not used
16 | DATASETS:
17 |   TRAIN: ("ade20k_panoptic_train",)
18 |   TEST: ("ade20k_panoptic_val",)
19 | SOLVER:
20 |   IMS_PER_BATCH: 16
21 |   BASE_LR: 0.0001
22 |   MAX_ITER: 160000
23 |   WARMUP_FACTOR: 1.0
24 |   WARMUP_ITERS: 0
25 |   WEIGHT_DECAY: 0.05
26 |   OPTIMIZER: "ADAMW"
27 |   LR_SCHEDULER_NAME: "WarmupPolyLR"
28 |   BACKBONE_MULTIPLIER: 0.1
29 |   CLIP_GRADIENTS:
30 |     ENABLED: True
31 |     CLIP_TYPE: "full_model"
32 |     CLIP_VALUE: 0.01
33 |     NORM_TYPE: 2.0
34 |   AMP:
35 |     ENABLED: True
36 | INPUT:
37 |   MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
38 |   MIN_SIZE_TRAIN_SAMPLING: "choice"
39 |   MIN_SIZE_TEST: 640
40 |   MAX_SIZE_TRAIN: 2560
41 |   MAX_SIZE_TEST: 2560
42 |   CROP:
43 |     ENABLED: True
44 |     TYPE: "absolute"
45 |     SIZE: (640, 640)
46 |     SINGLE_CATEGORY_MAX_AREA: 1.0
47 |   COLOR_AUG_SSD: True
48 |   SIZE_DIVISIBILITY: 640  # used in dataset mapper
49 |   FORMAT: "RGB"
50 |   DATASET_MAPPER_NAME: "mask_former_panoptic"
51 | TEST:
52 |   EVAL_PERIOD: 5000
53 |   AUG:
54 |     ENABLED: False
55 |     MIN_SIZES: [320, 480, 640, 800, 960, 1120]
56 |     MAX_SIZE: 4480
57 |     FLIP: True
58 | DATALOADER:
59 |   FILTER_EMPTY_ANNOTATIONS: True
60 |   NUM_WORKERS: 4
61 | VERSION: 2
62 | 


--------------------------------------------------------------------------------
/datasets/ade20k_instance_catid_mapping.txt:
--------------------------------------------------------------------------------
  1 | Instacne100	SceneParse150	FullADE20K
  2 | 1		8		165
  3 | 2		9		3055
  4 | 3		11		350
  5 | 4		13		1831
  6 | 5		15		774
  7 | 5		15		783
  8 | 6		16		2684
  9 | 7		19		687
 10 | 8		20		471
 11 | 9		21		401
 12 | 10		23		1735
 13 | 11		24		2473
 14 | 12		25		2329
 15 | 13		28		1564
 16 | 14		31		57
 17 | 15		32		2272
 18 | 16		33		907
 19 | 17		34		724
 20 | 18		36		2985
 21 | 18		36		533
 22 | 19		37		1395
 23 | 20		38		155
 24 | 21		39		2053
 25 | 22		40		689
 26 | 23		42		266
 27 | 24		43		581
 28 | 25		44		2380
 29 | 26		45		491
 30 | 27		46		627
 31 | 28		48		2388
 32 | 29		50		943
 33 | 30		51		2096
 34 | 31		54		2530
 35 | 32		56		420
 36 | 33		57		1948
 37 | 34		58		1869
 38 | 35		59		2251
 39 | 36		63		239
 40 | 37		65		571
 41 | 38		66		2793
 42 | 39		67		978
 43 | 40		68		236
 44 | 41		70		181
 45 | 42		71		629
 46 | 43		72		2598
 47 | 44		73		1744
 48 | 45		74		1374
 49 | 46		75		591
 50 | 47		76		2679
 51 | 48		77		223
 52 | 49		79		47
 53 | 50		81		327
 54 | 51		82		2821
 55 | 52		83		1451
 56 | 53		84		2880
 57 | 54		86		480
 58 | 55		87		77
 59 | 56		88		2616
 60 | 57		89		246
 61 | 57		89		247
 62 | 58		90		2733
 63 | 59		91		14
 64 | 60		93		38
 65 | 61		94		1936
 66 | 62		96		120
 67 | 63		98		1702
 68 | 64		99		249
 69 | 65		103		2928
 70 | 66		104		2337
 71 | 67		105		1023
 72 | 68		108		2989
 73 | 69		109		1930
 74 | 70		111		2586
 75 | 71		112		131
 76 | 72		113		146
 77 | 73		116		95
 78 | 74		117		1563
 79 | 75		119		1708
 80 | 76		120		103
 81 | 77		121		1002
 82 | 78		122		2569
 83 | 79		124		2833
 84 | 80		125		1551
 85 | 81		126		1981
 86 | 82		127		29
 87 | 83		128		187
 88 | 84		130		747
 89 | 85		131		2254
 90 | 86		133		2262
 91 | 87		134		1260
 92 | 88		135		2243
 93 | 89		136		2932
 94 | 90		137		2836
 95 | 91		138		2850
 96 | 92		139		64
 97 | 93		140		894
 98 | 94		143		1919
 99 | 95		144		1583
100 | 96		145		318
101 | 97		147		2046
102 | 98		148		1098
103 | 99		149		530
104 | 100		150		954
105 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/src/ms_deform_attn.h:
--------------------------------------------------------------------------------
 1 | /*!
 2 | **************************************************************************************************
 3 | * Deformable DETR
 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | **************************************************************************************************
 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | **************************************************************************************************
 9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #pragma once
17 | 
18 | #include "cpu/ms_deform_attn_cpu.h"
19 | 
20 | #ifdef WITH_CUDA
21 | #include "cuda/ms_deform_attn_cuda.h"
22 | #endif
23 | 
24 | 
25 | at::Tensor
26 | ms_deform_attn_forward(
27 |     const at::Tensor &value, 
28 |     const at::Tensor &spatial_shapes,
29 |     const at::Tensor &level_start_index,
30 |     const at::Tensor &sampling_loc,
31 |     const at::Tensor &attn_weight,
32 |     const int im2col_step)
33 | {
34 |     if (value.type().is_cuda())
35 |     {
36 | #ifdef WITH_CUDA
37 |         return ms_deform_attn_cuda_forward(
38 |             value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
39 | #else
40 |         AT_ERROR("Not compiled with GPU support");
41 | #endif
42 |     }
43 |     AT_ERROR("Not implemented on the CPU");
44 | }
45 | 
46 | std::vector<at::Tensor>
47 | ms_deform_attn_backward(
48 |     const at::Tensor &value, 
49 |     const at::Tensor &spatial_shapes,
50 |     const at::Tensor &level_start_index,
51 |     const at::Tensor &sampling_loc,
52 |     const at::Tensor &attn_weight,
53 |     const at::Tensor &grad_output,
54 |     const int im2col_step)
55 | {
56 |     if (value.type().is_cuda())
57 |     {
58 | #ifdef WITH_CUDA
59 |         return ms_deform_attn_cuda_backward(
60 |             value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
61 | #else
62 |         AT_ERROR("Not compiled with GPU support");
63 | #endif
64 |     }
65 |     AT_ERROR("Not implemented on the CPU");
66 | }
67 | 
68 | 


--------------------------------------------------------------------------------
/maskclip/modeling/transformer_decoder/position_encoding.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Facebook, Inc. and its affiliates.
 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py
 3 | """
 4 | Various positional encodings for the transformer.
 5 | """
 6 | import math
 7 | 
 8 | import torch
 9 | from torch import nn
10 | 
11 | 
class PositionEmbeddingSine(nn.Module):
    """
    Sinusoidal position embedding for image-shaped inputs.

    Row and column positions are obtained as running counts of the unmasked
    entries of ``mask`` and then encoded with sines and cosines at
    ``num_pos_feats`` temperature-scaled frequencies per axis. With a
    ``(N, C, H, W)`` input the result is ``(N, 2 * num_pos_feats, H, W)``,
    the y-encoding channels first and the x-encoding channels second.
    """

    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
        super().__init__()
        # An explicit scale only has an effect on normalized coordinates.
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        self.scale = 2 * math.pi if scale is None else scale

    def forward(self, x, mask=None):
        """
        Return the position encoding for ``x``.

        ``x`` is only used for its sizes along dims 0, 2, 3 and its device.
        ``mask`` is a boolean tensor where True marks positions to exclude
        from the running counts; when omitted, nothing is excluded.
        """
        if mask is None:
            mask_shape = (x.size(0), x.size(2), x.size(3))
            mask = torch.zeros(mask_shape, device=x.device, dtype=torch.bool)

        valid = ~mask
        rows = valid.cumsum(1, dtype=torch.float32)
        cols = valid.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            # Divide by the last running count along each axis, then rescale.
            rows = rows / (rows[:, -1:, :] + eps) * self.scale
            cols = cols / (cols[:, :, -1:] + eps) * self.scale

        index = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        # Consecutive index pairs (2k, 2k + 1) share one frequency.
        divisor = self.temperature ** (2 * (index // 2) / self.num_pos_feats)

        def _sincos(coords):
            # Even channels take the sine, odd channels the cosine; stacking
            # on a new last dim and flattening interleaves them again.
            angles = coords[:, :, :, None] / divisor
            pair = (angles[:, :, :, 0::2].sin(), angles[:, :, :, 1::2].cos())
            return torch.stack(pair, dim=4).flatten(3)

        encoded_y = _sincos(rows)
        encoded_x = _sincos(cols)
        return torch.cat((encoded_y, encoded_x), dim=3).permute(0, 3, 1, 2)

    def __repr__(self, _repr_indent=4):
        pad = " " * _repr_indent
        entries = (
            "num_pos_feats: {}".format(self.num_pos_feats),
            "temperature: {}".format(self.temperature),
            "normalize: {}".format(self.normalize),
            "scale: {}".format(self.scale),
        )
        out = ["Positional encoding " + self.__class__.__name__]
        out.extend(pad + entry for entry in entries)
        return "\n".join(out)
65 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Open-Vocabulary Universal Image Segmentation with MaskCLIP (ICML 2023)
 2 | 
 3 | [Zheng Ding](mailto:zhding@ucsd.edu), [Jieke Wang](), [Zhuowen Tu](http://www.cogsci.ucsd.edu/~ztu/)
 4 | 
 5 | [Arxiv](https://arxiv.org/abs/2208.08984) / [Project](https://maskclip.github.io) / [Video](https://youtu.be/nW0GYkLtka8)
 6 | 
 7 | ![teaser](figs/model.png)
 8 | 
 9 | ### Data preparation
10 | 
11 | For COCO and ADE20k data preparation, please refer to [Preparing Datasets in Mask2Former](https://github.com/facebookresearch/Mask2Former/tree/main/datasets).
12 | 
13 | ### Environment Setup
14 | 
 15 | Run the following commands to set up the environment.
16 | 
17 | ```
18 | conda create -n maskclip python=3.9
19 | conda activate maskclip
20 | conda install pytorch=1.10 cudatoolkit=11.3 torchvision=0.11 -c pytorch -c conda-forge
21 | python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.10/index.html
22 | pip install setuptools==59.5.0
23 | pip install timm opencv-python scipy einops
24 | pip install git+https://github.com/openai/CLIP.git
25 | pip install git+https://github.com/cocodataset/panopticapi.git
26 | 
 27 | cd maskclip/modeling/pixel_decoder/ops/
28 | sh make.sh
29 | ```
30 | 
31 | ### Training
32 | 
33 | #### Training Class-Agnostic Mask Proposal Network
34 | 
35 | You can train a class-agnostic mask proposal network by removing the classification head of previous segmentation models e.g., Mask2Former, MaskRCNN. We provide our trained class-agnostic mask proposal network [here](https://drive.google.com/file/d/1NdXOUzJVQUdl0V0HkQ0yFMUa2MJQojiv/view?usp=sharing).
36 | 
37 | #### Training MaskCLIP on COCO dataset
38 | 
39 | With the trained class-agnostic mask proposal network, we can train the MaskCLIP model through the following command. We train our model for 10,000 iterations with a batch size of 8.
40 | 
41 | ```
42 | python train_net.py --num-gpus 8 --config-file configs/coco/maskformer2_R50_bs16_50ep.yaml
43 | ```
44 | 
45 | ### Testing MaskCLIP on ADE20K dataset
46 | 
 47 | You can test our model on the ADE20K dataset using the trained model. We also provide our trained model [here](https://drive.google.com/file/d/1fzf4y-l-BwhBkxambk_yS4yrCwHOKij7/view?usp=sharing). Either set the path of `MODEL.WEIGHTS` in the yaml file or override it on the command line, as shown below.
48 | 
49 | ```
50 | python train_net.py --num-gpus 1 --config-file configs/ade20k/maskformer2_R50_bs16_160k.yaml --eval-only MODEL.WEIGHTS model_final.pth
51 | ``` 
52 | 
53 | ## Citation
54 | 
55 | If you find this work helpful, please consider citing MaskCLIP using the following BibTeX entry.
56 | 
57 | ```BibTeX
58 | @inproceedings{ding2023maskclip,
 59 |   author    = {Zheng Ding and Jieke Wang and Zhuowen Tu},
60 |   title     = {Open-Vocabulary Universal Image Segmentation with MaskCLIP},
61 |   booktitle = {International Conference on Machine Learning},
62 |   year      = {2023},
63 | }
64 | ```
65 | 
 66 | Please also check out [MasQCLIP](https://github.com/mlpc-ucsd/MasQCLIP) for our latest work on open-vocabulary segmentation.
67 | 
68 | 
69 | ## Acknowledgement
70 | 
 71 | This codebase was built upon and drew inspiration from [CLIP](https://github.com/openai/CLIP) and [Mask2Former](https://github.com/facebookresearch/Mask2Former). We thank the authors for making those repositories public.
72 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/setup.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | import os
13 | import glob
14 | 
15 | import torch
16 | 
17 | from torch.utils.cpp_extension import CUDA_HOME
18 | from torch.utils.cpp_extension import CppExtension
19 | from torch.utils.cpp_extension import CUDAExtension
20 | 
21 | from setuptools import find_packages
22 | from setuptools import setup
23 | 
24 | requirements = ["torch", "torchvision"]
25 | 
def get_extensions():
    """
    Build the extension-module list for ``setup()``.

    Sources are collected from ``src/*.cpp``, ``src/cpu/*.cpp`` and
    ``src/cuda/*.cu`` next to this file. A CUDA build is mandatory: unless
    CUDA is requested (``FORCE_CUDA`` set, or ``torch.cuda.is_available()``)
    and ``CUDA_HOME`` is known, ``NotImplementedError`` is raised.

    Returns:
        A one-element list holding the ``MultiScaleDeformableAttention``
        ``CUDAExtension``.
    """
    src_root = os.path.join(os.path.dirname(os.path.abspath(__file__)), "src")

    top_level_cpp = glob.glob(os.path.join(src_root, "*.cpp"))
    cpu_cpp = glob.glob(os.path.join(src_root, "cpu", "*.cpp"))
    cuda_cu = glob.glob(os.path.join(src_root, "cuda", "*.cu"))

    # Force cuda since torch ask for a device, not if cuda is in fact available.
    cuda_requested = os.environ.get('FORCE_CUDA') or torch.cuda.is_available()
    if not cuda_requested or CUDA_HOME is None:
        if CUDA_HOME is None:
            raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.')
        raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().')

    compile_args = {
        "cxx": [],
        "nvcc": [
            "-DCUDA_HAS_FP16=1",
            "-D__CUDA_NO_HALF_OPERATORS__",
            "-D__CUDA_NO_HALF_CONVERSIONS__",
            "-D__CUDA_NO_HALF2_OPERATORS__",
        ],
    }
    macros = [("WITH_CUDA", None)]

    all_sources = [
        os.path.join(src_root, path)
        for path in top_level_cpp + cpu_cpp + cuda_cu
    ]
    return [
        CUDAExtension(
            "MultiScaleDeformableAttention",
            all_sources,
            include_dirs=[src_root],
            define_macros=macros,
            extra_compile_args=compile_args,
        )
    ]
68 | 
# Package entry point. get_extensions() is evaluated while the arguments are
# being built, so when it raises NotImplementedError (no usable CUDA setup)
# the script stops before setup() itself is entered.
setup(
    name="MultiScaleDeformableAttention",
    version="1.0",
    author="Weijie Su",
    url="https://github.com/fundamentalvision/Deformable-DETR",
    description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
    packages=find_packages(exclude=("configs", "tests",)),
    ext_modules=get_extensions(),
    # Use torch's build_ext command in place of the setuptools default.
    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)
79 | 


--------------------------------------------------------------------------------
/datasets/prepare_coco_semantic_annos_from_panoptic_annos.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | # Copyright (c) Facebook, Inc. and its affiliates.
 4 | 
 5 | import functools
 6 | import json
 7 | import multiprocessing as mp
 8 | import numpy as np
 9 | import os
10 | import time
11 | from fvcore.common.download import download
12 | from panopticapi.utils import rgb2id
13 | from PIL import Image
14 | 
15 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
16 | 
17 | 
def _process_panoptic_to_semantic(input_panoptic, output_semantic, segments, id_map):
    """
    Convert one panoptic annotation image into a semantic label image.

    Args:
        input_panoptic: path of the panoptic image to read.
        output_semantic: path the semantic label image is written to.
        segments: iterable of dicts, each with an ``"id"`` (segment id as
            decoded by ``rgb2id``) and a ``"category_id"``.
        id_map: mapping from ``category_id`` to the label value to write.

    Raises:
        KeyError: if a segment's ``category_id`` is missing from ``id_map``.
    """
    # Close the source file deterministically rather than leaving the handle
    # to the garbage collector; the pixels are copied into the array first.
    with Image.open(input_panoptic) as panoptic_img:
        panoptic = rgb2id(np.asarray(panoptic_img, dtype=np.uint32))

    # Pixels not claimed by any segment keep the value 255.
    output = np.full_like(panoptic, 255, dtype=np.uint8)
    for seg in segments:
        output[panoptic == seg["id"]] = id_map[seg["category_id"]]
    Image.fromarray(output).save(output_semantic)
27 | 
28 | 
def separate_coco_semantic_from_panoptic(panoptic_json, panoptic_root, sem_seg_root, categories):
    """
    Create semantic segmentation annotations from panoptic segmentation
    annotations.

    Every category is mapped to its position in ``categories`` (contiguous
    ids starting from 0, things and stuff alike). One output image is written
    per annotation, under the same file name, into ``sem_seg_root``.

    Args:
        panoptic_json (str): path to the panoptic json file, in COCO's format.
        panoptic_root (str): a directory with panoptic annotation files, in COCO's format.
        sem_seg_root (str): a directory to output semantic annotation files
        categories (list[dict]): category metadata. Each dict needs to have:
            "id": corresponds to the "category_id" in the json annotations
    """
    os.makedirs(sem_seg_root, exist_ok=True)

    # Category indices must stay clear of the 255 value the per-image worker
    # writes for pixels that belong to no segment.
    assert len(categories) <= 254
    # map from category id to id in the output semantic annotation
    id_map = {category["id"]: index for index, category in enumerate(categories)}
    print(id_map)

    with open(panoptic_json) as f:
        obj = json.load(f)

    def iter_annotations():
        # Yield (input path, output path, segments) for every annotation.
        for anno in obj["annotations"]:
            file_name = anno["file_name"]
            yield (
                os.path.join(panoptic_root, file_name),
                os.path.join(sem_seg_root, file_name),
                anno["segments_info"],
            )

    # The context manager shuts the worker processes down once the blocking
    # starmap call has returned, or if it raises, so the pool does not
    # outlive this call.
    with mp.Pool(processes=max(mp.cpu_count() // 2, 4)) as pool:
        print("Start writing to {} ...".format(sem_seg_root))
        start = time.time()
        pool.starmap(
            functools.partial(_process_panoptic_to_semantic, id_map=id_map),
            iter_annotations(),
            chunksize=100,
        )
        print("Finished. time: {:.2f}s".format(time.time() - start))
74 | 
75 | 
if __name__ == "__main__":
    # COCO lives under $DETECTRON2_DATASETS (default "datasets"); convert the
    # validation split first, then the training split.
    coco_root = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco")
    for split in ("val2017", "train2017"):
        panoptic_json = os.path.join(coco_root, "annotations/panoptic_{}.json".format(split))
        panoptic_dir = os.path.join(coco_root, "panoptic_{}".format(split))
        semseg_dir = os.path.join(coco_root, "panoptic_semseg_{}".format(split))
        separate_coco_semantic_from_panoptic(
            panoptic_json, panoptic_dir, semseg_dir, COCO_CATEGORIES
        )
85 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/build/temp.linux-x86_64-cpython-39/build.ninja:
--------------------------------------------------------------------------------
 1 | ninja_required_version = 1.3
 2 | cxx = c++
 3 | nvcc = /usr/bin/nvcc
 4 | 
 5 | cflags = -pthread -B /home/jiw010/anaconda3/envs/maskclip/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /home/jiw010/anaconda3/envs/maskclip/include -I/home/jiw010/anaconda3/envs/maskclip/include -fPIC -O2 -isystem /home/jiw010/anaconda3/envs/maskclip/include -fPIC -DWITH_CUDA -I/mnt/sdd/OVS_experiments/maskclip/train_bkb/mask2former/modeling/pixel_decoder/ops/src -I/home/jiw010/anaconda3/envs/maskclip/lib/python3.9/site-packages/torch/include -I/home/jiw010/anaconda3/envs/maskclip/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/home/jiw010/anaconda3/envs/maskclip/lib/python3.9/site-packages/torch/include/TH -I/home/jiw010/anaconda3/envs/maskclip/lib/python3.9/site-packages/torch/include/THC -I/home/jiw010/anaconda3/envs/maskclip/include/python3.9 -c
 6 | post_cflags = -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=MultiScaleDeformableAttention -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++14
 7 | cuda_cflags = -DWITH_CUDA -I/mnt/sdd/OVS_experiments/maskclip/train_bkb/mask2former/modeling/pixel_decoder/ops/src -I/home/jiw010/anaconda3/envs/maskclip/lib/python3.9/site-packages/torch/include -I/home/jiw010/anaconda3/envs/maskclip/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/home/jiw010/anaconda3/envs/maskclip/lib/python3.9/site-packages/torch/include/TH -I/home/jiw010/anaconda3/envs/maskclip/lib/python3.9/site-packages/torch/include/THC -I/home/jiw010/anaconda3/envs/maskclip/include/python3.9 -c
 8 | cuda_post_cflags = -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DCUDA_HAS_FP16=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=MultiScaleDeformableAttention -D_GLIBCXX_USE_CXX11_ABI=0 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_86,code=sm_86 -std=c++14
 9 | ldflags = 
10 | 
11 | rule compile
12 |   command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags
13 |   depfile = $out.d
14 |   deps = gcc
15 | 
16 | rule cuda_compile
17 |   depfile = $out.d
18 |   deps = gcc
19 |   command = $nvcc  $cuda_cflags -c $in -o $out $cuda_post_cflags
20 | 
21 | 
22 | 
23 | build /mnt/sdd/OVS_experiments/maskclip/train_bkb/mask2former/modeling/pixel_decoder/ops/build/temp.linux-x86_64-cpython-39/mnt/sdd/OVS_experiments/maskclip/train_bkb/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.o: compile /mnt/sdd/OVS_experiments/maskclip/train_bkb/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp
24 | build /mnt/sdd/OVS_experiments/maskclip/train_bkb/mask2former/modeling/pixel_decoder/ops/build/temp.linux-x86_64-cpython-39/mnt/sdd/OVS_experiments/maskclip/train_bkb/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.o: cuda_compile /mnt/sdd/OVS_experiments/maskclip/train_bkb/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu
25 | build /mnt/sdd/OVS_experiments/maskclip/train_bkb/mask2former/modeling/pixel_decoder/ops/build/temp.linux-x86_64-cpython-39/mnt/sdd/OVS_experiments/maskclip/train_bkb/mask2former/modeling/pixel_decoder/ops/src/vision.o: compile /mnt/sdd/OVS_experiments/maskclip/train_bkb/mask2former/modeling/pixel_decoder/ops/src/vision.cpp
26 | 
27 | 
28 | 
29 | 
30 | 
31 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | from __future__ import absolute_import
13 | from __future__ import print_function
14 | from __future__ import division
15 | 
16 | import torch
17 | import torch.nn.functional as F
18 | from torch.autograd import Function
19 | from torch.autograd.function import once_differentiable
20 | 
21 | try:
22 |     import MultiScaleDeformableAttention as MSDA
23 | except ModuleNotFoundError as e:
24 |     info_string = (
25 |         "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n"
26 |         "\t`cd mask2former/modeling/pixel_decoder/ops`\n"
27 |         "\t`sh make.sh`\n"
28 |     )
29 |     raise ModuleNotFoundError(info_string)
30 | 
31 | 
class MSDeformAttnFunction(Function):
    """Autograd bridge to the compiled ``MultiScaleDeformableAttention`` (``MSDA``) extension.

    Both passes are delegated to the extension; this class only records what
    the backward call needs and maps the returned gradients onto the
    positional inputs of :meth:`forward`.
    """

    @staticmethod
    def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
        """Call ``MSDA.ms_deform_attn_forward`` and stash its inputs for the backward pass."""
        # im2col_step is kept as a plain ctx attribute; the five tensor inputs
        # go through save_for_backward below.
        ctx.im2col_step = im2col_step
        tensor_inputs = (
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
        )
        result = MSDA.ms_deform_attn_forward(*tensor_inputs, ctx.im2col_step)
        ctx.save_for_backward(*tensor_inputs)
        return result

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        """Call ``MSDA.ms_deform_attn_backward`` and return one gradient slot per forward input."""
        saved = ctx.saved_tensors
        grads = MSDA.ms_deform_attn_backward(*saved, grad_output, ctx.im2col_step)
        grad_value, grad_sampling_loc, grad_attn_weight = grads

        # Slots follow forward()'s argument order; the shape tensor, the level
        # start index and im2col_step receive no gradient.
        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
50 | 
51 | 
def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
    """Pure-PyTorch multi-scale deformable attention (debug/test reference for the CUDA op).

    Args:
        value: tensor unpacked as (N, S, M, D); S is split per level using the
            H*W products of ``value_spatial_shapes``.
        value_spatial_shapes: iterable of (H, W) pairs, one per level.
        sampling_locations: tensor unpacked as (N, Lq, M, L, P, 2); values are
            mapped from [0, 1] to grid_sample's [-1, 1] range.
        attention_weights: tensor reshaped from (N, Lq, M, L, P).

    Returns:
        Contiguous tensor of shape (N, Lq, M*D).
    """
    batch, _, heads, head_dim = value.shape
    _, num_queries, heads, num_levels, num_points, _ = sampling_locations.shape

    level_sizes = [h * w for h, w in value_spatial_shapes]
    per_level_values = value.split(level_sizes, dim=1)
    # grid_sample expects coordinates in [-1, 1].
    grids = 2 * sampling_locations - 1

    sampled_per_level = []
    for level, (h, w) in enumerate(value_spatial_shapes):
        # (N, H*W, M, D) -> (N, H*W, M*D) -> (N, M*D, H*W) -> (N*M, D, H, W)
        feature_map = per_level_values[level].flatten(2).transpose(1, 2)
        feature_map = feature_map.reshape(batch * heads, head_dim, h, w)
        # (N, Lq, M, P, 2) -> (N, M, Lq, P, 2) -> (N*M, Lq, P, 2)
        level_grid = grids[:, :, :, level].transpose(1, 2).flatten(0, 1)
        # Result: (N*M, D, Lq, P)
        sampled_per_level.append(
            F.grid_sample(feature_map, level_grid,
                          mode='bilinear', padding_mode='zeros', align_corners=False)
        )

    # (N, Lq, M, L, P) -> (N, M, Lq, L, P) -> (N*M, 1, Lq, L*P)
    weights = attention_weights.transpose(1, 2).reshape(
        batch * heads, 1, num_queries, num_levels * num_points)
    # (N*M, D, Lq, L, P) -> (N*M, D, Lq, L*P)
    stacked = torch.stack(sampled_per_level, dim=-2).flatten(-2)
    attended = (stacked * weights).sum(-1).view(batch, heads * head_dim, num_queries)
    return attended.transpose(1, 2).contiguous()
73 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/build/lib.linux-x86_64-3.9/functions/ms_deform_attn_func.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | from __future__ import absolute_import
13 | from __future__ import print_function
14 | from __future__ import division
15 | 
16 | import torch
17 | import torch.nn.functional as F
18 | from torch.autograd import Function
19 | from torch.autograd.function import once_differentiable
20 | 
21 | try:
22 |     import MultiScaleDeformableAttention as MSDA
23 | except ModuleNotFoundError as e:
24 |     info_string = (
25 |         "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n"
26 |         "\t`cd mask2former/modeling/pixel_decoder/ops`\n"
27 |         "\t`sh make.sh`\n"
28 |     )
29 |     raise ModuleNotFoundError(info_string)
30 | 
31 | 
class MSDeformAttnFunction(Function):
    """Autograd bridge to the compiled ``MultiScaleDeformableAttention`` (``MSDA``) extension.

    Both passes are delegated to the extension; this class only records what
    the backward call needs and maps the returned gradients onto the
    positional inputs of :meth:`forward`.
    """

    @staticmethod
    def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
        """Call ``MSDA.ms_deform_attn_forward`` and stash its inputs for the backward pass."""
        # im2col_step is kept as a plain ctx attribute; the five tensor inputs
        # go through save_for_backward below.
        ctx.im2col_step = im2col_step
        tensor_inputs = (
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
        )
        result = MSDA.ms_deform_attn_forward(*tensor_inputs, ctx.im2col_step)
        ctx.save_for_backward(*tensor_inputs)
        return result

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        """Call ``MSDA.ms_deform_attn_backward`` and return one gradient slot per forward input."""
        saved = ctx.saved_tensors
        grads = MSDA.ms_deform_attn_backward(*saved, grad_output, ctx.im2col_step)
        grad_value, grad_sampling_loc, grad_attn_weight = grads

        # Slots follow forward()'s argument order; the shape tensor, the level
        # start index and im2col_step receive no gradient.
        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
50 | 
51 | 
def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
    """Pure-PyTorch multi-scale deformable attention (debug/test reference for the CUDA op).

    Args:
        value: tensor unpacked as (N, S, M, D); S is split per level using the
            H*W products of ``value_spatial_shapes``.
        value_spatial_shapes: iterable of (H, W) pairs, one per level.
        sampling_locations: tensor unpacked as (N, Lq, M, L, P, 2); values are
            mapped from [0, 1] to grid_sample's [-1, 1] range.
        attention_weights: tensor reshaped from (N, Lq, M, L, P).

    Returns:
        Contiguous tensor of shape (N, Lq, M*D).
    """
    batch, _, heads, head_dim = value.shape
    _, num_queries, heads, num_levels, num_points, _ = sampling_locations.shape

    level_sizes = [h * w for h, w in value_spatial_shapes]
    per_level_values = value.split(level_sizes, dim=1)
    # grid_sample expects coordinates in [-1, 1].
    grids = 2 * sampling_locations - 1

    sampled_per_level = []
    for level, (h, w) in enumerate(value_spatial_shapes):
        # (N, H*W, M, D) -> (N, H*W, M*D) -> (N, M*D, H*W) -> (N*M, D, H, W)
        feature_map = per_level_values[level].flatten(2).transpose(1, 2)
        feature_map = feature_map.reshape(batch * heads, head_dim, h, w)
        # (N, Lq, M, P, 2) -> (N, M, Lq, P, 2) -> (N*M, Lq, P, 2)
        level_grid = grids[:, :, :, level].transpose(1, 2).flatten(0, 1)
        # Result: (N*M, D, Lq, P)
        sampled_per_level.append(
            F.grid_sample(feature_map, level_grid,
                          mode='bilinear', padding_mode='zeros', align_corners=False)
        )

    # (N, Lq, M, L, P) -> (N, M, Lq, L, P) -> (N*M, 1, Lq, L*P)
    weights = attention_weights.transpose(1, 2).reshape(
        batch * heads, 1, num_queries, num_levels * num_points)
    # (N*M, D, Lq, L, P) -> (N*M, D, Lq, L*P)
    stacked = torch.stack(sampled_per_level, dim=-2).flatten(-2)
    attended = (stacked * weights).sum(-1).view(batch, heads * head_dim, num_queries)
    return attended.transpose(1, 2).contiguous()
73 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/build/lib.linux-x86_64-cpython-39/functions/ms_deform_attn_func.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | from __future__ import absolute_import
13 | from __future__ import print_function
14 | from __future__ import division
15 | 
16 | import torch
17 | import torch.nn.functional as F
18 | from torch.autograd import Function
19 | from torch.autograd.function import once_differentiable
20 | 
21 | try:
22 |     import MultiScaleDeformableAttention as MSDA
23 | except ModuleNotFoundError as e:
24 |     info_string = (
25 |         "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n"
26 |         "\t`cd mask2former/modeling/pixel_decoder/ops`\n"
27 |         "\t`sh make.sh`\n"
28 |     )
29 |     raise ModuleNotFoundError(info_string)
30 | 
31 | 
class MSDeformAttnFunction(Function):
    """Autograd bridge to the compiled ``MultiScaleDeformableAttention`` (``MSDA``) extension.

    Both passes are delegated to the extension; this class only records what
    the backward call needs and maps the returned gradients onto the
    positional inputs of :meth:`forward`.
    """

    @staticmethod
    def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
        """Call ``MSDA.ms_deform_attn_forward`` and stash its inputs for the backward pass."""
        # im2col_step is kept as a plain ctx attribute; the five tensor inputs
        # go through save_for_backward below.
        ctx.im2col_step = im2col_step
        tensor_inputs = (
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
        )
        result = MSDA.ms_deform_attn_forward(*tensor_inputs, ctx.im2col_step)
        ctx.save_for_backward(*tensor_inputs)
        return result

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        """Call ``MSDA.ms_deform_attn_backward`` and return one gradient slot per forward input."""
        saved = ctx.saved_tensors
        grads = MSDA.ms_deform_attn_backward(*saved, grad_output, ctx.im2col_step)
        grad_value, grad_sampling_loc, grad_attn_weight = grads

        # Slots follow forward()'s argument order; the shape tensor, the level
        # start index and im2col_step receive no gradient.
        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
50 | 
51 | 
def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
    """Pure-PyTorch multi-scale deformable attention (debug/test reference for the CUDA op).

    Args:
        value: tensor unpacked as (N, S, M, D); S is split per level using the
            H*W products of ``value_spatial_shapes``.
        value_spatial_shapes: iterable of (H, W) pairs, one per level.
        sampling_locations: tensor unpacked as (N, Lq, M, L, P, 2); values are
            mapped from [0, 1] to grid_sample's [-1, 1] range.
        attention_weights: tensor reshaped from (N, Lq, M, L, P).

    Returns:
        Contiguous tensor of shape (N, Lq, M*D).
    """
    batch, _, heads, head_dim = value.shape
    _, num_queries, heads, num_levels, num_points, _ = sampling_locations.shape

    level_sizes = [h * w for h, w in value_spatial_shapes]
    per_level_values = value.split(level_sizes, dim=1)
    # grid_sample expects coordinates in [-1, 1].
    grids = 2 * sampling_locations - 1

    sampled_per_level = []
    for level, (h, w) in enumerate(value_spatial_shapes):
        # (N, H*W, M, D) -> (N, H*W, M*D) -> (N, M*D, H*W) -> (N*M, D, H, W)
        feature_map = per_level_values[level].flatten(2).transpose(1, 2)
        feature_map = feature_map.reshape(batch * heads, head_dim, h, w)
        # (N, Lq, M, P, 2) -> (N, M, Lq, P, 2) -> (N*M, Lq, P, 2)
        level_grid = grids[:, :, :, level].transpose(1, 2).flatten(0, 1)
        # Result: (N*M, D, Lq, P)
        sampled_per_level.append(
            F.grid_sample(feature_map, level_grid,
                          mode='bilinear', padding_mode='zeros', align_corners=False)
        )

    # (N, Lq, M, L, P) -> (N, M, Lq, L, P) -> (N*M, 1, Lq, L*P)
    weights = attention_weights.transpose(1, 2).reshape(
        batch * heads, 1, num_queries, num_levels * num_points)
    # (N*M, D, Lq, L, P) -> (N*M, D, Lq, L*P)
    stacked = torch.stack(sampled_per_level, dim=-2).flatten(-2)
    attended = (stacked * weights).sum(-1).view(batch, heads * head_dim, num_queries)
    return attended.transpose(1, 2).contiguous()
73 | 


--------------------------------------------------------------------------------
/maskclip/test_time_augmentation.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | import copy
  3 | import logging
  4 | from itertools import count
  5 | 
  6 | import numpy as np
  7 | import torch
  8 | from fvcore.transforms import HFlipTransform
  9 | from torch import nn
 10 | from torch.nn.parallel import DistributedDataParallel
 11 | 
 12 | from detectron2.data.detection_utils import read_image
 13 | from detectron2.modeling import DatasetMapperTTA
 14 | 
 15 | 
 16 | __all__ = [
 17 |     "SemanticSegmentorWithTTA",
 18 | ]
 19 | 
 20 | 
 21 | class SemanticSegmentorWithTTA(nn.Module):
 22 |     """
 23 |     A SemanticSegmentor with test-time augmentation enabled.
 24 |     Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`.
 25 |     """
 26 | 
 27 |     def __init__(self, cfg, model, tta_mapper=None, batch_size=1):
 28 |         """
 29 |         Args:
 30 |             cfg (CfgNode):
 31 |             model (SemanticSegmentor): a SemanticSegmentor to apply TTA on.
 32 |             tta_mapper (callable): takes a dataset dict and returns a list of
 33 |                 augmented versions of the dataset dict. Defaults to
 34 |                 `DatasetMapperTTA(cfg)`.
 35 |             batch_size (int): batch the augmented images into this batch size for inference.
 36 |         """
 37 |         super().__init__()
 38 |         if isinstance(model, DistributedDataParallel):
 39 |             model = model.module
 40 |         self.cfg = cfg.clone()
 41 | 
 42 |         self.model = model
 43 | 
 44 |         if tta_mapper is None:
 45 |             tta_mapper = DatasetMapperTTA(cfg)
 46 |         self.tta_mapper = tta_mapper
 47 |         self.batch_size = batch_size
 48 | 
 49 |     def __call__(self, batched_inputs):
 50 |         """
 51 |         Same input/output format as :meth:`SemanticSegmentor.forward`
 52 |         """
 53 | 
 54 |         def _maybe_read_image(dataset_dict):
 55 |             ret = copy.copy(dataset_dict)
 56 |             if "image" not in ret:
 57 |                 image = read_image(ret.pop("file_name"), self.model.input_format)
 58 |                 image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1)))  # CHW
 59 |                 ret["image"] = image
 60 |             if "height" not in ret and "width" not in ret:
 61 |                 ret["height"] = image.shape[1]
 62 |                 ret["width"] = image.shape[2]
 63 |             return ret
 64 | 
 65 |         processed_results = []
 66 |         for x in batched_inputs:
 67 |             result = self._inference_one_image(_maybe_read_image(x))
 68 |             processed_results.append(result)
 69 |         return processed_results
 70 | 
 71 |     def _inference_one_image(self, input):
 72 |         """
 73 |         Args:
 74 |             input (dict): one dataset dict with "image" field being a CHW tensor
 75 |         Returns:
 76 |             dict: one output dict
 77 |         """
 78 |         orig_shape = (input["height"], input["width"])
 79 |         augmented_inputs, tfms = self._get_augmented_inputs(input)
 80 | 
 81 |         final_predictions = None
 82 |         count_predictions = 0
 83 |         for input, tfm in zip(augmented_inputs, tfms):
 84 |             count_predictions += 1
 85 |             with torch.no_grad():
 86 |                 if final_predictions is None:
 87 |                     if any(isinstance(t, HFlipTransform) for t in tfm.transforms):
 88 |                         final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2])
 89 |                     else:
 90 |                         final_predictions = self.model([input])[0].pop("sem_seg")
 91 |                 else:
 92 |                     if any(isinstance(t, HFlipTransform) for t in tfm.transforms):
 93 |                         final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2])
 94 |                     else:
 95 |                         final_predictions += self.model([input])[0].pop("sem_seg")
 96 | 
 97 |         final_predictions = final_predictions / count_predictions
 98 |         return {"sem_seg": final_predictions}
 99 | 
100 |     def _get_augmented_inputs(self, input):
101 |         augmented_inputs = self.tta_mapper(input)
102 |         tfms = [x.pop("transforms") for x in augmented_inputs]
103 |         return augmented_inputs, tfms
104 | 


--------------------------------------------------------------------------------
/maskclip/utils/misc.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py
  3 | """
  4 | Misc functions, including distributed helpers.
  5 | 
  6 | Mostly copy-paste from torchvision references.
  7 | """
  8 | from typing import List, Optional
  9 | 
 10 | import torch
 11 | import torch.distributed as dist
 12 | import torchvision
 13 | from torch import Tensor
 14 | 
 15 | 
 16 | def _max_by_axis(the_list):
 17 |     # type: (List[List[int]]) -> List[int]
 18 |     maxes = the_list[0]
 19 |     for sublist in the_list[1:]:
 20 |         for index, item in enumerate(sublist):
 21 |             maxes[index] = max(maxes[index], item)
 22 |     return maxes
 23 | 
 24 | 
 25 | class NestedTensor(object):
 26 |     def __init__(self, tensors, mask: Optional[Tensor]):
 27 |         self.tensors = tensors
 28 |         self.mask = mask
 29 | 
 30 |     def to(self, device):
 31 |         # type: (Device) -> NestedTensor # noqa
 32 |         cast_tensor = self.tensors.to(device)
 33 |         mask = self.mask
 34 |         if mask is not None:
 35 |             assert mask is not None
 36 |             cast_mask = mask.to(device)
 37 |         else:
 38 |             cast_mask = None
 39 |         return NestedTensor(cast_tensor, cast_mask)
 40 | 
 41 |     def decompose(self):
 42 |         return self.tensors, self.mask
 43 | 
 44 |     def __repr__(self):
 45 |         return str(self.tensors)
 46 | 
 47 | 
 48 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
 49 |     # TODO make this more general
 50 |     if tensor_list[0].ndim == 3:
 51 |         if torchvision._is_tracing():
 52 |             # nested_tensor_from_tensor_list() does not export well to ONNX
 53 |             # call _onnx_nested_tensor_from_tensor_list() instead
 54 |             return _onnx_nested_tensor_from_tensor_list(tensor_list)
 55 | 
 56 |         # TODO make it support different-sized images
 57 |         max_size = _max_by_axis([list(img.shape) for img in tensor_list])
 58 |         # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list]))
 59 |         batch_shape = [len(tensor_list)] + max_size
 60 |         b, c, h, w = batch_shape
 61 |         dtype = tensor_list[0].dtype
 62 |         device = tensor_list[0].device
 63 |         tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
 64 |         mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
 65 |         for img, pad_img, m in zip(tensor_list, tensor, mask):
 66 |             pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
 67 |             m[: img.shape[1], : img.shape[2]] = False
 68 |     else:
 69 |         raise ValueError("not supported")
 70 |     return NestedTensor(tensor, mask)
 71 | 
 72 | 
 73 | # _onnx_nested_tensor_from_tensor_list() is an implementation of
 74 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing.
@torch.jit.unused
def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:
    """Pad-and-stack variant of nested_tensor_from_tensor_list built from ``pad``/``stack`` calls.

    Each image is padded at the end of every dimension up to the per-dimension
    maximum over ``tensor_list``; the mask is True over the padded area.
    """
    # Per-dimension maximum size across the list.
    # NOTE(review): torch.stack over img.shape[i] presumably relies on shape
    # entries being tensors during tracing — confirm before calling eagerly.
    max_size = []
    for i in range(tensor_list[0].dim()):
        max_size_i = torch.max(
            torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)
        ).to(torch.int64)
        max_size.append(max_size_i)
    max_size = tuple(max_size)

    # work around for
    # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
    # m[: img.shape[1], :img.shape[2]] = False
    # which is not yet supported in onnx
    padded_imgs = []
    padded_masks = []
    for img in tensor_list:
        # Amount of padding needed along each dimension of this image.
        padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
        # F.pad takes (before, after) pairs starting from the last dimension.
        padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0]))
        padded_imgs.append(padded_img)

        # Zeros over the image extent, padded with ones, then cast to bool.
        m = torch.zeros_like(img[0], dtype=torch.int, device=img.device)
        padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1)
        padded_masks.append(padded_mask.to(torch.bool))

    tensor = torch.stack(padded_imgs)
    mask = torch.stack(padded_masks)

    return NestedTensor(tensor, mask=mask)
104 | 
105 | 
def is_dist_avail_and_initialized():
    """Return True only when torch.distributed is available and already initialized."""
    # Short-circuit keeps is_initialized() from being called when the
    # distributed package is unavailable.
    return bool(dist.is_available() and dist.is_initialized())
112 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/test.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | from __future__ import absolute_import
13 | from __future__ import print_function
14 | from __future__ import division
15 | 
16 | import time
17 | import torch
18 | import torch.nn as nn
19 | from torch.autograd import gradcheck
20 | 
21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
22 | 
23 | 
# Sizes shared by all checks below: value tensors are built as (N, S, M, D),
# sampling locations as (N, Lq, M, L, P, 2) and attention weights as (N, Lq, M, L, P).
N, M, D = 1, 2, 2
Lq, L, P = 2, 2, 2
# One (H, W) row per feature level, placed on the GPU.
shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()
# Start offset of each level: a leading 0 followed by the cumulative H*W of the preceding levels.
level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))
# Total number of positions summed over all levels.
S = sum([(H*W).item() for H, W in shapes])


# Seed once at import so the random inputs drawn by the checks are reproducible.
torch.manual_seed(3)
32 | 
33 | 
@torch.no_grad()
def check_forward_equal_with_pytorch_double():
    """Compare the CUDA forward op with the PyTorch reference in float64 and print the errors."""
    # Draw order (value, locations, weights) decides which seeded random
    # numbers each tensor receives.
    feats = torch.rand(N, S, M, D).cuda() * 0.01
    locs = torch.rand(N, Lq, M, L, P, 2).cuda()
    weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
    # Normalise over the last two dims (levels and points) so they sum to 1.
    weights /= weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
    step = 2

    feats64, locs64, weights64 = feats.double(), locs.double(), weights.double()
    reference = ms_deform_attn_core_pytorch(feats64, shapes, locs64, weights64).detach().cpu()
    output_cuda = MSDeformAttnFunction.apply(
        feats64, shapes, level_start_index, locs64, weights64, step).detach().cpu()

    fwdok = torch.allclose(output_cuda, reference)
    abs_diff = (output_cuda - reference).abs()
    max_abs_err = abs_diff.max()
    max_rel_err = (abs_diff / reference.abs()).max()

    print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
48 | 
49 | 
@torch.no_grad()
def check_forward_equal_with_pytorch_float():
    """Compare the CUDA forward op with the PyTorch reference in float32 and print the errors."""
    # Draw order (value, locations, weights) decides which seeded random
    # numbers each tensor receives.
    feats = torch.rand(N, S, M, D).cuda() * 0.01
    locs = torch.rand(N, Lq, M, L, P, 2).cuda()
    weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
    # Normalise over the last two dims (levels and points) so they sum to 1.
    weights /= weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
    step = 2

    reference = ms_deform_attn_core_pytorch(feats, shapes, locs, weights).detach().cpu()
    output_cuda = MSDeformAttnFunction.apply(
        feats, shapes, level_start_index, locs, weights, step).detach().cpu()

    # Looser tolerances than the float64 check.
    fwdok = torch.allclose(output_cuda, reference, rtol=1e-2, atol=1e-3)
    abs_diff = (output_cuda - reference).abs()
    max_abs_err = abs_diff.max()
    max_rel_err = (abs_diff / reference.abs()).max()

    print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
64 | 
65 | 
def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True):
    """Run ``gradcheck`` on the CUDA op in float64 and print the outcome.

    Args:
        channels: size of the last dimension of the random value tensor.
        grad_value, grad_sampling_loc, grad_attn_weight: whether the matching
            input is marked as requiring grad before the check.
    """
    # Draw order (value, locations, weights) decides which seeded random
    # numbers each tensor receives.
    feats = torch.rand(N, S, M, channels).cuda() * 0.01
    locs = torch.rand(N, Lq, M, L, P, 2).cuda()
    weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
    weights /= weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
    step = 2

    for tensor, wants_grad in ((feats, grad_value), (locs, grad_sampling_loc), (weights, grad_attn_weight)):
        tensor.requires_grad = wants_grad

    # gradcheck receives float64 copies of the inputs.
    check_inputs = (
        feats.double(),
        shapes,
        level_start_index,
        locs.double(),
        weights.double(),
        step,
    )
    gradok = gradcheck(MSDeformAttnFunction.apply, check_inputs)

    print(f'* {gradok} check_gradient_numerical(D={channels})')
82 | 
83 | 
if __name__ == '__main__':
    # Forward comparisons against the PyTorch reference: float64 first, then float32.
    check_forward_equal_with_pytorch_double()
    check_forward_equal_with_pytorch_float()

    # Numerical gradient check repeated for several channel sizes,
    # with gradients enabled for all three differentiable inputs.
    for channels in [30, 32, 64, 71, 1025, 2048, 3096]:
        check_gradient_numerical(channels, True, True, True)
90 | 
91 | 
92 | 
93 | 


--------------------------------------------------------------------------------
/datasets/prepare_ade20k_ins_seg.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | # Copyright (c) Facebook, Inc. and its affiliates.
  4 | import glob
  5 | import json
  6 | import os
  7 | from collections import Counter
  8 | 
  9 | import numpy as np
 10 | import tqdm
 11 | from panopticapi.utils import IdGenerator, save_json
 12 | from PIL import Image
 13 | import pycocotools.mask as mask_util
 14 | 
 15 | 
 16 | if __name__ == "__main__":
 17 |     dataset_dir = os.getenv("DETECTRON2_DATASETS", "datasets")
 18 | 
 19 |     for name, dirname in [("train", "training"), ("val", "validation")]:
 20 |         image_dir = os.path.join(dataset_dir, f"ADEChallengeData2016/images/{dirname}/")
 21 |         instance_dir = os.path.join(
 22 |             dataset_dir, f"ADEChallengeData2016/annotations_instance/{dirname}/"
 23 |         )
 24 | 
 25 |         # img_id = 0
 26 |         ann_id = 1
 27 | 
 28 |         # json
 29 |         out_file = os.path.join(dataset_dir, f"ADEChallengeData2016/ade20k_instance_{name}.json")
 30 | 
 31 |         # json config
 32 |         instance_config_file = "datasets/ade20k_instance_imgCatIds.json"
 33 |         with open(instance_config_file) as f:
 34 |             category_dict = json.load(f)["categories"]
 35 | 
 36 |         # load catid mapping
 37 |         # it is important to share category id for both instance and panoptic annotations
 38 |         mapping_file = "datasets/ade20k_instance_catid_mapping.txt"
 39 |         with open(mapping_file) as f:
 40 |             map_id = {}
 41 |             for i, line in enumerate(f.readlines()):
 42 |                 if i == 0:
 43 |                     continue
 44 |                 ins_id, sem_id, _ = line.strip().split()
 45 |                 # shift id by 1 because we want it to start from 0!
 46 |                 # ignore_label becomes 255
 47 |                 map_id[int(ins_id)] = int(sem_id) - 1
 48 | 
 49 |         for cat in category_dict:
 50 |             cat["id"] = map_id[cat["id"]]
 51 | 
 52 |         filenames = sorted(glob.glob(os.path.join(image_dir, "*.jpg")))
 53 | 
 54 |         ann_dict = {}
 55 |         images = []
 56 |         annotations = []
 57 | 
 58 |         for idx, filename in enumerate(tqdm.tqdm(filenames)):
 59 |             image = {}
 60 |             image_id = os.path.basename(filename).split(".")[0]
 61 | 
 62 |             image["id"] = image_id
 63 |             image["file_name"] = os.path.basename(filename)
 64 | 
 65 |             original_format = np.array(Image.open(filename))
 66 |             image["width"] = original_format.shape[1]
 67 |             image["height"] = original_format.shape[0]
 68 | 
 69 |             images.append(image)
 70 | 
 71 |             filename_instance = os.path.join(instance_dir, image_id + ".png")
 72 |             ins_seg = np.asarray(Image.open(filename_instance))
 73 |             assert ins_seg.dtype == np.uint8
 74 | 
 75 |             instance_cat_ids = ins_seg[..., 0]
 76 |             # instance id starts from 1!
 77 |             # because 0 is reserved as VOID label
 78 |             instance_ins_ids = ins_seg[..., 1]
 79 | 
 80 |             # process things
 81 |             for thing_id in np.unique(instance_ins_ids):
 82 |                 if thing_id == 0:
 83 |                     continue
 84 |                 mask = instance_ins_ids == thing_id
 85 |                 instance_cat_id = np.unique(instance_cat_ids[mask])
 86 |                 assert len(instance_cat_id) == 1
 87 | 
 88 |                 anno = {}
 89 |                 anno['id'] = ann_id
 90 |                 ann_id += 1
 91 |                 anno['image_id'] = image['id']
 92 |                 anno["iscrowd"] = int(0)
 93 |                 anno["category_id"] = int(map_id[instance_cat_id[0]])
 94 | 
 95 |                 inds = np.nonzero(mask)
 96 |                 ymin, ymax = inds[0].min(), inds[0].max()
 97 |                 xmin, xmax = inds[1].min(), inds[1].max()
 98 |                 anno["bbox"] = [int(xmin), int(ymin), int(xmax - xmin + 1), int(ymax - ymin + 1)]
 99 |                 # if xmax <= xmin or ymax <= ymin:
100 |                 #     continue
101 |                 rle = mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0]
102 |                 rle["counts"] = rle["counts"].decode("utf-8")
103 |                 anno["segmentation"] = rle
104 |                 anno["area"] = int(mask_util.area(rle))
105 |                 annotations.append(anno)
106 | 
107 |         # save this
108 |         ann_dict['images'] = images
109 |         ann_dict['categories'] = category_dict
110 |         ann_dict['annotations'] = annotations
111 | 
112 |         save_json(ann_dict, out_file)
113 | 


--------------------------------------------------------------------------------
/maskclip/data/datasets/register_ade20k_instance.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Facebook, Inc. and its affiliates.
 2 | import json
 3 | import logging
 4 | import numpy as np
 5 | import os
 6 | from PIL import Image
 7 | 
 8 | from detectron2.data import DatasetCatalog, MetadataCatalog
 9 | from detectron2.data.datasets.coco import load_coco_json, register_coco_instances
10 | from detectron2.utils.file_io import PathManager
11 | 
12 | ADE_CATEGORIES = [{'id': 7, 'name': 'bed'}, {'id': 8, 'name': 'windowpane'}, {'id': 10, 'name': 'cabinet'}, {'id': 12, 'name': 'person'}, {'id': 14, 'name': 'door'}, {'id': 15, 'name': 'table'}, {'id': 18, 'name': 'curtain'}, {'id': 19, 'name': 'chair'}, {'id': 20, 'name': 'car'}, {'id': 22, 'name': 'painting'}, {'id': 23, 'name': 'sofa'}, {'id': 24, 'name': 'shelf'}, {'id': 27, 'name': 'mirror'}, {'id': 30, 'name': 'armchair'}, {'id': 31, 'name': 'seat'}, {'id': 32, 'name': 'fence'}, {'id': 33, 'name': 'desk'}, {'id': 35, 'name': 'wardrobe'}, {'id': 36, 'name': 'lamp'}, {'id': 37, 'name': 'bathtub'}, {'id': 38, 'name': 'railing'}, {'id': 39, 'name': 'cushion'}, {'id': 41, 'name': 'box'}, {'id': 42, 'name': 'column'}, {'id': 43, 'name': 'signboard'}, {'id': 44, 'name': 'chest of drawers'}, {'id': 45, 'name': 'counter'}, {'id': 47, 'name': 'sink'}, {'id': 49, 'name': 'fireplace'}, {'id': 50, 'name': 'refrigerator'}, {'id': 53, 'name': 'stairs'}, {'id': 55, 'name': 'case'}, {'id': 56, 'name': 'pool table'}, {'id': 57, 'name': 'pillow'}, {'id': 58, 'name': 'screen door'}, {'id': 62, 'name': 'bookcase'}, {'id': 64, 'name': 'coffee table'}, {'id': 65, 'name': 'toilet'}, {'id': 66, 'name': 'flower'}, {'id': 67, 'name': 'book'}, {'id': 69, 'name': 'bench'}, {'id': 70, 'name': 'countertop'}, {'id': 71, 'name': 'stove'}, {'id': 72, 'name': 'palm'}, {'id': 73, 'name': 'kitchen island'}, {'id': 74, 'name': 'computer'}, {'id': 75, 'name': 'swivel chair'}, {'id': 76, 'name': 'boat'}, {'id': 78, 'name': 'arcade machine'}, {'id': 80, 'name': 'bus'}, {'id': 81, 'name': 'towel'}, {'id': 82, 'name': 'light'}, {'id': 83, 'name': 'truck'}, {'id': 85, 'name': 'chandelier'}, {'id': 86, 'name': 'awning'}, {'id': 87, 'name': 'streetlight'}, {'id': 88, 'name': 'booth'}, {'id': 89, 'name': 'television receiver'}, {'id': 90, 'name': 'airplane'}, {'id': 92, 'name': 'apparel'}, {'id': 93, 'name': 'pole'}, {'id': 95, 'name': 'bannister'}, {'id': 97, 'name': 'ottoman'}, {'id': 98, 'name': 
'bottle'}, {'id': 102, 'name': 'van'}, {'id': 103, 'name': 'ship'}, {'id': 104, 'name': 'fountain'}, {'id': 107, 'name': 'washer'}, {'id': 108, 'name': 'plaything'}, {'id': 110, 'name': 'stool'}, {'id': 111, 'name': 'barrel'}, {'id': 112, 'name': 'basket'}, {'id': 115, 'name': 'bag'}, {'id': 116, 'name': 'minibike'}, {'id': 118, 'name': 'oven'}, {'id': 119, 'name': 'ball'}, {'id': 120, 'name': 'food'}, {'id': 121, 'name': 'step'}, {'id': 123, 'name': 'trade name'}, {'id': 124, 'name': 'microwave'}, {'id': 125, 'name': 'pot'}, {'id': 126, 'name': 'animal'}, {'id': 127, 'name': 'bicycle'}, {'id': 129, 'name': 'dishwasher'}, {'id': 130, 'name': 'screen'}, {'id': 132, 'name': 'sculpture'}, {'id': 133, 'name': 'hood'}, {'id': 134, 'name': 'sconce'}, {'id': 135, 'name': 'vase'}, {'id': 136, 'name': 'traffic light'}, {'id': 137, 'name': 'tray'}, {'id': 138, 'name': 'ashcan'}, {'id': 139, 'name': 'fan'}, {'id': 142, 'name': 'plate'}, {'id': 143, 'name': 'monitor'}, {'id': 144, 'name': 'bulletin board'}, {'id': 146, 'name': 'radiator'}, {'id': 147, 'name': 'glass'}, {'id': 148, 'name': 'clock'}, {'id': 149, 'name': 'flag'}]
13 | 
14 | 
15 | _PREDEFINED_SPLITS = {
16 |     # dataset name -> (image root, COCO-format annotation json), both relative to the datasets root
17 |     "ade20k_instance_train": (
18 |         "ADEChallengeData2016/images/training",
19 |         "ADEChallengeData2016/ade20k_instance_train.json",
20 |     ),
21 |     "ade20k_instance_val": (
22 |         "ADEChallengeData2016/images/validation",
23 |         "ADEChallengeData2016/ade20k_instance_val.json",
24 |     ),
25 | }
26 | 
27 | 
28 | def _get_ade_instances_meta():
29 |     thing_ids = [k["id"] for k in ADE_CATEGORIES]
30 |     assert len(thing_ids) == 100, len(thing_ids)
31 |     # Mapping from the incontiguous ADE category id to an id in [0, 99]
32 |     thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)}
33 |     thing_classes = [k["name"] for k in ADE_CATEGORIES]
34 |     ret = {
35 |         "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id,
36 |         "thing_classes": thing_classes,
37 |     }
38 |     return ret
39 | 
40 | 
41 | def register_all_ade20k_instance(root):
42 |     for key, (image_root, json_file) in _PREDEFINED_SPLITS.items():
43 |         # Assume pre-defined datasets live in `./datasets`.
44 |         register_coco_instances(
45 |             key,
46 |             _get_ade_instances_meta(),
47 |             os.path.join(root, json_file) if "://" not in json_file else json_file,
48 |             os.path.join(root, image_root),
49 |         )
50 | 
51 | 
52 | _root = os.getenv("DETECTRON2_DATASETS", "datasets")
53 | register_all_ade20k_instance(_root)
54 | 


--------------------------------------------------------------------------------
/maskclip/evaluation/instance_evaluation.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | import contextlib
  3 | import copy
  4 | import io
  5 | import itertools
  6 | import json
  7 | import logging
  8 | import numpy as np
  9 | import os
 10 | import pickle
 11 | from collections import OrderedDict
 12 | import pycocotools.mask as mask_util
 13 | import torch
 14 | from pycocotools.coco import COCO
 15 | from pycocotools.cocoeval import COCOeval
 16 | from tabulate import tabulate
 17 | 
 18 | import detectron2.utils.comm as comm
 19 | from detectron2.config import CfgNode
 20 | from detectron2.data import MetadataCatalog
 21 | from detectron2.data.datasets.coco import convert_to_coco_json
 22 | from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco
 23 | from detectron2.evaluation.fast_eval_api import COCOeval_opt
 24 | from detectron2.structures import Boxes, BoxMode, pairwise_iou
 25 | from detectron2.utils.file_io import PathManager
 26 | from detectron2.utils.logger import create_small_table
 27 | 
 28 | 
 29 | # modified from COCOEvaluator for instance segmentation
 30 | class InstanceSegEvaluator(COCOEvaluator):
 31 |     """
 32 |     COCO-style evaluator (see the parent class, COCOEvaluator) whose only override is
 33 |     `_eval_predictions`.
 34 | 
 35 |     The override maps each predicted contiguous class id back to its dataset
 36 |     category id through `thing_dataset_id_to_contiguous_id`, and asserts only that
 37 |     the predicted id is present in that mapping. The stricter range check that
 38 |     is kept commented out below is not applied.
 39 | 
 40 |     Metrics are produced per task ("bbox", "segm", "keypoints") via the COCO API.
 41 |     """
 42 | 
 43 |     def _eval_predictions(self, predictions, img_ids=None):
 44 |         """
 45 |         Evaluate predictions. Fill self._results with the metrics of the tasks.
 46 |         """
 47 |         self._logger.info("Preparing results for COCO format ...")
 48 |         coco_results = list(itertools.chain(*[x["instances"] for x in predictions]))
 49 |         tasks = self._tasks or self._tasks_from_predictions(coco_results)
 50 | 
 51 |         # map contiguous class ids back to the dataset's own category ids (done in place)
 52 |         if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
 53 |             dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id
 54 |             # all_contiguous_ids = list(dataset_id_to_contiguous_id.values())
 55 |             # num_classes = len(all_contiguous_ids)
 56 |             # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1
 57 | 
 58 |             reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()}
 59 |             for result in coco_results:
 60 |                 category_id = result["category_id"]
 61 |                 # assert category_id < num_classes, (
 62 |                 #     f"A prediction has class={category_id}, "
 63 |                 #     f"but the dataset only has {num_classes} classes and "
 64 |                 #     f"predicted class id should be in [0, {num_classes - 1}]."
 65 |                 # )
 66 |                 assert category_id in reverse_id_mapping, (
 67 |                     f"A prediction has class={category_id}, "
 68 |                     f"but the dataset only has class ids in {dataset_id_to_contiguous_id}."
 69 |                 )
 70 |                 result["category_id"] = reverse_id_mapping[category_id]
 71 | 
 72 |         if self._output_dir:
 73 |             file_path = os.path.join(self._output_dir, "coco_instances_results.json")
 74 |             self._logger.info("Saving results to {}".format(file_path))
 75 |             with PathManager.open(file_path, "w") as f:
 76 |                 f.write(json.dumps(coco_results))
 77 |                 f.flush()
 78 | 
 79 |         if not self._do_evaluation:
 80 |             self._logger.info("Annotations are not available for evaluation.")
 81 |             return
 82 | 
 83 |         self._logger.info(
 84 |             "Evaluating predictions with {} COCO API...".format(
 85 |                 "unofficial" if self._use_fast_impl else "official"
 86 |             )
 87 |         )
 88 |         for task in sorted(tasks):
 89 |             assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!"
 90 |             coco_eval = (
 91 |                 _evaluate_predictions_on_coco(
 92 |                     self._coco_api,
 93 |                     coco_results,
 94 |                     task,
 95 |                     kpt_oks_sigmas=self._kpt_oks_sigmas,
 96 |                     use_fast_impl=self._use_fast_impl,
 97 |                     img_ids=img_ids,
 98 |                     max_dets_per_image=self._max_dets_per_image,
 99 |                 )
100 |                 if len(coco_results) > 0
101 |                 else None  # cocoapi does not handle empty results very well
102 |             )
103 | 
104 |             res = self._derive_coco_results(
105 |                 coco_eval, task, class_names=self._metadata.get("thing_classes")
106 |             )
107 |             self._results[task] = res
108 | 


--------------------------------------------------------------------------------
/maskclip/config.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # Copyright (c) Facebook, Inc. and its affiliates.
  3 | from detectron2.config import CfgNode as CN
  4 | 
  5 | 
  6 | def add_maskformer2_config(cfg):
  7 |     """
  8 |     Add MaskFormer/Mask2Former defaults, plus the MaskCLIP `MODEL.CLIP_MODEL` node, to `cfg` in place.
  9 |     """
 10 |     # NOTE: configs from original maskformer
 11 |     # data config
 12 |     # select the dataset mapper
 13 |     cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic"
 14 |     # Color augmentation
 15 |     cfg.INPUT.COLOR_AUG_SSD = False
 16 |     # We retry random cropping until no single category in semantic segmentation GT occupies more
 17 |     # than `SINGLE_CATEGORY_MAX_AREA` part of the crop.
 18 |     cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0
 19 |     # Pad image and segmentation GT in dataset mapper.
 20 |     cfg.INPUT.SIZE_DIVISIBILITY = -1
 21 | 
 22 |     # solver config
 23 |     # weight decay on embedding
 24 |     cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0
 25 |     # optimizer
 26 |     cfg.SOLVER.OPTIMIZER = "ADAMW"
 27 |     cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1
 28 | 
 29 |     # mask_former model config
 30 |     cfg.MODEL.MASK_FORMER = CN()
 31 | 
 32 |     # loss
 33 |     cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION = True
 34 |     cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT = 0.1
 35 |     cfg.MODEL.MASK_FORMER.CLASS_WEIGHT = 1.0
 36 |     cfg.MODEL.MASK_FORMER.DICE_WEIGHT = 1.0
 37 |     cfg.MODEL.MASK_FORMER.MASK_WEIGHT = 20.0
 38 | 
 39 |     # transformer config
 40 |     cfg.MODEL.MASK_FORMER.NHEADS = 8
 41 |     cfg.MODEL.MASK_FORMER.DROPOUT = 0.1
 42 |     cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD = 2048
 43 |     cfg.MODEL.MASK_FORMER.ENC_LAYERS = 0
 44 |     cfg.MODEL.MASK_FORMER.DEC_LAYERS = 6
 45 |     cfg.MODEL.MASK_FORMER.PRE_NORM = False
 46 | 
 47 |     cfg.MODEL.MASK_FORMER.HIDDEN_DIM = 256
 48 |     cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES = 100
 49 | 
 50 |     cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE = "res5"
 51 |     cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ = False
 52 | 
 53 |     # mask_former inference config
 54 |     cfg.MODEL.MASK_FORMER.TEST = CN()
 55 |     cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True
 56 |     cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = False
 57 |     cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = False
 58 |     cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0
 59 |     cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD = 0.0
 60 |     cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False
 61 | 
 62 |     # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet)
 63 |     # you can use this config to override
 64 |     cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY = 32
 65 | 
 66 |     # pixel decoder config
 67 |     cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256
 68 |     # adding transformer in pixel decoder
 69 |     cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0
 70 |     # pixel decoder
 71 |     cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder"
 72 | 
 73 |     # swin transformer backbone
 74 |     cfg.MODEL.SWIN = CN()
 75 |     cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224
 76 |     cfg.MODEL.SWIN.PATCH_SIZE = 4
 77 |     cfg.MODEL.SWIN.EMBED_DIM = 96
 78 |     cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2]
 79 |     cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24]
 80 |     cfg.MODEL.SWIN.WINDOW_SIZE = 7
 81 |     cfg.MODEL.SWIN.MLP_RATIO = 4.0
 82 |     cfg.MODEL.SWIN.QKV_BIAS = True
 83 |     cfg.MODEL.SWIN.QK_SCALE = None
 84 |     cfg.MODEL.SWIN.DROP_RATE = 0.0
 85 |     cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0
 86 |     cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3
 87 |     cfg.MODEL.SWIN.APE = False
 88 |     cfg.MODEL.SWIN.PATCH_NORM = True
 89 |     cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
 90 |     cfg.MODEL.SWIN.USE_CHECKPOINT = False
 91 | 
 92 |     # NOTE: maskformer2 extra configs
 93 |     # transformer module
 94 |     cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME = "MultiScaleMaskedTransformerDecoder"
 95 | 
 96 |     # LSJ aug
 97 |     cfg.INPUT.IMAGE_SIZE = 1024
 98 |     cfg.INPUT.MIN_SCALE = 0.1
 99 |     cfg.INPUT.MAX_SCALE = 2.0
100 | 
101 |     # MSDeformAttn encoder configs
102 |     cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"]
103 |     cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4
104 |     cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8
105 | 
106 |     # point loss configs
107 |     # Number of points sampled during training for a mask point head.
108 |     cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS = 112 * 112
109 |     # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the
110 |     # original paper.
111 |     cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO = 3.0
112 |     # Importance sampling parameter for PointRend point sampling during training. Parameter `beta` in
113 |     # the original paper.
114 |     cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75
115 | 
116 |     # add MaskCLIP configs
117 |     cfg.MODEL.CLIP_MODEL = CN()
118 |     cfg.MODEL.CLIP_MODEL.NAME = 'ViT-L/14@336px'
119 |     cfg.MODEL.CLIP_MODEL.INPUT_RESOLUTION = 336
120 |     cfg.MODEL.CLIP_MODEL.PATCH_SIZE = 14
121 |     cfg.MODEL.CLIP_MODEL.WIDTH = 1024
122 |     cfg.MODEL.CLIP_MODEL.LAYERS = 24
123 |     cfg.MODEL.CLIP_MODEL.HEADS = 16
124 |     cfg.MODEL.CLIP_MODEL.OUTPUT_DIM = 768
125 | 
126 |     cfg.MODEL.CLIP_MODEL.TEMPERATURE = 0.01
127 | 
128 | 
129 | 
130 | 


--------------------------------------------------------------------------------
/datasets/README.md:
--------------------------------------------------------------------------------
  1 | # Prepare Datasets for Mask2Former
  2 | 
  3 | A dataset can be used by accessing [DatasetCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.DatasetCatalog)
  4 | for its data, or [MetadataCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.MetadataCatalog) for its metadata (class names, etc).
  5 | This document explains how to set up the builtin datasets so they can be used by the above APIs.
  6 | [Use Custom Datasets](https://detectron2.readthedocs.io/tutorials/datasets.html) gives a deeper dive on how to use `DatasetCatalog` and `MetadataCatalog`,
  7 | and how to add new datasets to them.
  8 | 
  9 | MaskFormer has builtin support for a few datasets.
 10 | The datasets are assumed to exist in a directory specified by the environment variable
 11 | `DETECTRON2_DATASETS`.
 12 | Under this directory, detectron2 will look for datasets in the structure described below, if needed.
 13 | ```
 14 | $DETECTRON2_DATASETS/
 15 |   ADEChallengeData2016/
 16 |   coco/
 17 |   cityscapes/
 18 |   mapillary_vistas/
 19 | ```
 20 | 
 21 | You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`.
 22 | If left unset, the default is `./datasets` relative to your current working directory.
 23 | 
 24 | The [model zoo](https://github.com/facebookresearch/MaskFormer/blob/master/MODEL_ZOO.md)
 25 | contains configs and models that use these builtin datasets.
 26 | 
 27 | 
 28 | ## Expected dataset structure for [COCO](https://cocodataset.org/#download):
 29 | 
 30 | ```
 31 | coco/
 32 |   annotations/
 33 |     instances_{train,val}2017.json
 34 |     panoptic_{train,val}2017.json
 35 |   {train,val}2017/
 36 |     # image files that are mentioned in the corresponding json
 37 |   panoptic_{train,val}2017/  # png annotations
 38 |   panoptic_semseg_{train,val}2017/  # generated by the script mentioned below
 39 | ```
 40 | 
 41 | Install panopticapi by:
 42 | ```
 43 | pip install git+https://github.com/cocodataset/panopticapi.git
 44 | ```
 45 | Then, run `python datasets/prepare_coco_semantic_annos_from_panoptic_annos.py`, to extract semantic annotations from panoptic annotations (only used for evaluation).
 46 | 
 47 | 
 48 | ## Expected dataset structure for [cityscapes](https://www.cityscapes-dataset.com/downloads/):
 49 | ```
 50 | cityscapes/
 51 |   gtFine/
 52 |     train/
 53 |       aachen/
 54 |         color.png, instanceIds.png, labelIds.png, polygons.json,
 55 |         labelTrainIds.png
 56 |       ...
 57 |     val/
 58 |     test/
 59 |     # below are generated Cityscapes panoptic annotation
 60 |     cityscapes_panoptic_train.json
 61 |     cityscapes_panoptic_train/
 62 |     cityscapes_panoptic_val.json
 63 |     cityscapes_panoptic_val/
 64 |     cityscapes_panoptic_test.json
 65 |     cityscapes_panoptic_test/
 66 |   leftImg8bit/
 67 |     train/
 68 |     val/
 69 |     test/
 70 | ```
 71 | Install cityscapes scripts by:
 72 | ```
 73 | pip install git+https://github.com/mcordts/cityscapesScripts.git
 74 | ```
 75 | 
 76 | Note: to create labelTrainIds.png, first prepare the above structure, then run cityscapesScripts with:
 77 | ```
 78 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesscripts/preparation/createTrainIdLabelImgs.py
 79 | ```
 80 | These files are not needed for instance segmentation.
 81 | 
 82 | Note: to generate the Cityscapes panoptic dataset, run cityscapesScripts with:
 83 | ```
 84 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesscripts/preparation/createPanopticImgs.py
 85 | ```
 86 | These files are not needed for semantic and instance segmentation.
 87 | 
 88 | 
 89 | ## Expected dataset structure for [ADE20k](http://sceneparsing.csail.mit.edu/):
 90 | ```
 91 | ADEChallengeData2016/
 92 |   images/
 93 |   annotations/
 94 |   objectInfo150.txt
 95 |   # download instance annotation
 96 |   annotations_instance/
 97 |   # generated by prepare_ade20k_sem_seg.py
 98 |   annotations_detectron2/
 99 |   # below are generated by prepare_ade20k_pan_seg.py
100 |   ade20k_panoptic_{train,val}.json
101 |   ade20k_panoptic_{train,val}/
102 |   # below are generated by prepare_ade20k_ins_seg.py
103 |   ade20k_instance_{train,val}.json
104 | ```
105 | 
106 | The directory `annotations_detectron2` is generated by running `python datasets/prepare_ade20k_sem_seg.py`.
107 | 
108 | Install panopticapi by:
109 | ```bash
110 | pip install git+https://github.com/cocodataset/panopticapi.git
111 | ```
112 | 
113 | Download the instance annotation from http://sceneparsing.csail.mit.edu/:
114 | ```bash
115 | wget http://sceneparsing.csail.mit.edu/data/ChallengeData2017/annotations_instance.tar
116 | ```
117 | 
118 | Then, run `python datasets/prepare_ade20k_pan_seg.py`, to combine semantic and instance annotations for panoptic annotations.
119 | 
120 | And run `python datasets/prepare_ade20k_ins_seg.py`, to extract instance annotations in COCO format.
121 | 
122 | 
123 | ## Expected dataset structure for [Mapillary Vistas](https://www.mapillary.com/dataset/vistas):
124 | ```
125 | mapillary_vistas/
126 |   training/
127 |     images/
128 |     instances/
129 |     labels/
130 |     panoptic/
131 |   validation/
132 |     images/
133 |     instances/
134 |     labels/
135 |     panoptic/
136 |   mapillary_vistas_instance_{train,val}.json  # generated by the script mentioned below
137 | ```
138 | 
139 | No preprocessing is needed for Mapillary Vistas on semantic and panoptic segmentation.
140 | 
141 | If you want to evaluate instance segmentation on Mapillary Vistas, run `python datasets/prepare_mapillary_vistas_ins_seg.py` to generate COCO-style instance annotations.
142 | 
143 | 
144 | ## Expected dataset structure for [YouTubeVIS 2019](https://competitions.codalab.org/competitions/20128):
145 | 
146 | ```
147 | ytvis_2019/
148 |   {train,valid,test}.json
149 |   {train,valid,test}/
150 |     Annotations/
151 |     JPEGImages/
152 | ```
153 | 
154 | ## Expected dataset structure for [YouTubeVIS 2021](https://competitions.codalab.org/competitions/28988):
155 | 
156 | ```
157 | ytvis_2021/
158 |   {train,valid,test}.json
159 |   {train,valid,test}/
160 |     Annotations/
161 |     JPEGImages/
162 | ```
163 | 


--------------------------------------------------------------------------------
/maskclip/modeling/meta_arch/mask_former_head.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | import logging
  3 | from copy import deepcopy
  4 | from typing import Callable, Dict, List, Optional, Tuple, Union
  5 | 
  6 | import fvcore.nn.weight_init as weight_init
  7 | from torch import nn
  8 | from torch.nn import functional as F
  9 | 
 10 | from detectron2.config import configurable
 11 | from detectron2.layers import Conv2d, ShapeSpec, get_norm
 12 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
 13 | 
 14 | from ..transformer_decoder.maskformer_transformer_decoder import build_transformer_decoder
 15 | from ..pixel_decoder.fpn import build_pixel_decoder
 16 | 
 17 | 
 18 | @SEM_SEG_HEADS_REGISTRY.register()
 19 | class MaskFormerHead(nn.Module):
 20 |     """Segmentation head that runs a pixel decoder and passes its features to a transformer predictor."""
 21 |     _version = 2
 22 | 
 23 |     def _load_from_state_dict(
 24 |         self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
 25 |     ):
 26 |         version = local_metadata.get("version", None)
 27 |         if version is None or version < 2:
 28 |             # Do not warn if train from scratch
 29 |             scratch = True
 30 |             logger = logging.getLogger(__name__)
 31 |             for k in list(state_dict.keys()):
 32 |                 newk = k
 33 |                 if "sem_seg_head" in k and not k.startswith(prefix + "predictor"):
 34 |                     newk = k.replace(prefix, prefix + "pixel_decoder.")
 35 |                     # pre-v2 keys outside the predictor are moved under the "pixel_decoder." sub-prefix
 36 |                 if newk != k:
 37 |                     state_dict[newk] = state_dict[k]
 38 |                     del state_dict[k]
 39 |                     scratch = False
 40 | 
 41 |             if not scratch:
 42 |                 logger.warning(
 43 |                     f"Weight format of {self.__class__.__name__} have changed! "
 44 |                     "Please upgrade your models. Applying automatic conversion now ..."
 45 |                 )
 46 | 
 47 |     @configurable
 48 |     def __init__(
 49 |         self,
 50 |         input_shape: Dict[str, ShapeSpec],
 51 |         *,
 52 |         num_classes: int,
 53 |         pixel_decoder: nn.Module,
 54 |         loss_weight: float = 1.0,
 55 |         ignore_value: int = -1,
 56 |         # extra parameters
 57 |         transformer_predictor: nn.Module,
 58 |         transformer_in_feature: str,
 59 |     ):
 60 |         """
 61 |         NOTE: this interface is experimental.
 62 |         Args:
 63 |             input_shape: shapes (channels and stride) of the input features
 64 |             num_classes: number of classes to predict
 65 |             pixel_decoder: the pixel decoder module
 66 |             loss_weight: loss weight
 67 |             ignore_value: category id to be ignored during training.
 68 |             transformer_predictor: the transformer decoder that makes prediction
 69 |             transformer_in_feature: input feature name to the transformer_predictor
 70 |         """
 71 |         super().__init__()
 72 |         input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
 73 |         self.in_features = [k for k, v in input_shape]
 74 |         feature_strides = [v.stride for k, v in input_shape]
 75 |         feature_channels = [v.channels for k, v in input_shape]
 76 |         # NOTE(review): feature_strides / feature_channels are computed but never used in this class.
 77 |         self.ignore_value = ignore_value
 78 |         self.common_stride = 4
 79 |         self.loss_weight = loss_weight
 80 | 
 81 |         self.pixel_decoder = pixel_decoder
 82 |         self.predictor = transformer_predictor
 83 |         self.transformer_in_feature = transformer_in_feature
 84 | 
 85 |         self.num_classes = num_classes
 86 | 
 87 |     @classmethod
 88 |     def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
 89 |         # figure out in_channels to transformer predictor
 90 |         if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder":
 91 |             transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
 92 |         elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding":
 93 |             transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
 94 |         elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder":  # for maskformer2
 95 |             transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
 96 |         else:
 97 |             transformer_predictor_in_channels = input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels
 98 | 
 99 |         return {
100 |             "input_shape": {
101 |                 k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
102 |             },
103 |             "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
104 |             "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
105 |             "pixel_decoder": build_pixel_decoder(cfg, input_shape),
106 |             "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT,
107 |             "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE,
108 |             "transformer_predictor": build_transformer_decoder(
109 |                 cfg,
110 |                 transformer_predictor_in_channels,
111 |                 mask_classification=True,
112 |             ),
113 |         }
114 | 
115 |     def forward(self, features, mask=None):
116 |         return self.layers(features, mask)
117 | 
118 |     def layers(self, features, mask=None):
119 |         mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(features)
120 |         if self.transformer_in_feature == "multi_scale_pixel_decoder":
121 |             predictions = self.predictor(multi_scale_features, mask_features, mask)
122 |         else:
123 |             if self.transformer_in_feature == "transformer_encoder":
124 |                 assert (
125 |                     transformer_encoder_features is not None
126 |                 ), "Please use the TransformerEncoderPixelDecoder."
127 |                 predictions = self.predictor(transformer_encoder_features, mask_features, mask)
128 |             elif self.transformer_in_feature == "pixel_embedding":
129 |                 predictions = self.predictor(mask_features, mask_features, mask)
130 |             else:
131 |                 predictions = self.predictor(features[self.transformer_in_feature], mask_features, mask)
132 |         return predictions
133 | 


--------------------------------------------------------------------------------
/maskclip/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
  3 | import copy
  4 | import logging
  5 | 
  6 | import numpy as np
  7 | import torch
  8 | 
  9 | from detectron2.config import configurable
 10 | from detectron2.data import detection_utils as utils
 11 | from detectron2.data import transforms as T
 12 | from detectron2.data.transforms import TransformGen
 13 | from detectron2.structures import BitMasks, Boxes, Instances
 14 | 
 15 | __all__ = ["COCOPanopticNewBaselineDatasetMapper"]
 16 | 
 17 | 
 18 | def build_transform_gen(cfg, is_train):
 19 |     """
 20 |     Create a list of default :class:`Augmentation` from config.
 21 |     Now it includes resizing and flipping.
 22 |     Returns:
 23 |         list[Augmentation]
 24 |     """
 25 |     assert is_train, "Only support training augmentation"
 26 |     image_size = cfg.INPUT.IMAGE_SIZE
 27 |     min_scale = cfg.INPUT.MIN_SCALE
 28 |     max_scale = cfg.INPUT.MAX_SCALE
 29 | 
 30 |     augmentation = []
 31 | 
 32 |     if cfg.INPUT.RANDOM_FLIP != "none":
 33 |         augmentation.append(
 34 |             T.RandomFlip(
 35 |                 horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal",
 36 |                 vertical=cfg.INPUT.RANDOM_FLIP == "vertical",
 37 |             )
 38 |         )
 39 | 
 40 |     augmentation.extend([
 41 |         # T.ResizeScale(
 42 |         #     min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size
 43 |         # ),
 44 |         # T.FixedSizeCrop(crop_size=(image_size, image_size)),
 45 | 
 46 |         T.Resize((image_size, image_size))
 47 |     ])
 48 | 
 49 |     return augmentation
 50 | 
 51 | 
 52 | # This is specifically designed for the COCO dataset.
 53 | class COCOPanopticNewBaselineDatasetMapper:
 54 |     """
 55 |     A callable which takes a dataset dict in Detectron2 Dataset format,
 56 |     and map it into a format used by MaskFormer.
 57 | 
 58 |     This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
 59 | 
 60 |     The callable currently does the following:
 61 | 
 62 |     1. Read the image from "file_name"
 63 |     2. Applies geometric transforms to the image and annotation
 64 |     3. Find and applies suitable cropping to the image and annotation
 65 |     4. Prepare image and annotation to Tensors
 66 |     """
 67 | 
 68 |     @configurable
 69 |     def __init__(
 70 |         self,
 71 |         is_train=True,
 72 |         *,
 73 |         tfm_gens,
 74 |         image_format,
 75 |     ):
 76 |         """
 77 |         NOTE: this interface is experimental.
 78 |         Args:
 79 |             is_train: for training or inference
 80 |             augmentations: a list of augmentations or deterministic transforms to apply
 81 |             crop_gen: crop augmentation
 82 |             tfm_gens: data augmentation
 83 |             image_format: an image format supported by :func:`detection_utils.read_image`.
 84 |         """
 85 |         self.tfm_gens = tfm_gens
 86 |         logging.getLogger(__name__).info(
 87 |             "[COCOPanopticNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(
 88 |                 str(self.tfm_gens)
 89 |             )
 90 |         )
 91 | 
 92 |         self.img_format = image_format
 93 |         self.is_train = is_train
 94 | 
 95 |     @classmethod
 96 |     def from_config(cls, cfg, is_train=True):
 97 |         # Build augmentation
 98 |         tfm_gens = build_transform_gen(cfg, is_train)
 99 | 
100 |         ret = {
101 |             "is_train": is_train,
102 |             "tfm_gens": tfm_gens,
103 |             "image_format": cfg.INPUT.FORMAT,
104 |         }
105 |         return ret
106 | 
107 |     def __call__(self, dataset_dict):
108 |         """
109 |         Args:
110 |             dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
111 | 
112 |         Returns:
113 |             dict: a format that builtin models in detectron2 accept
114 |         """
115 |         dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
116 |         image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
117 |         utils.check_image_size(dataset_dict, image)
118 | 
119 |         image, transforms = T.apply_transform_gens(self.tfm_gens, image)
120 |         image_shape = image.shape[:2]  # h, w
121 | 
122 |         # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
123 |         # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
124 |         # Therefore it's important to use torch.Tensor.
125 |         dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
126 | 
127 |         if not self.is_train:
128 |             # USER: Modify this if you want to keep them for some reason.
129 |             dataset_dict.pop("annotations", None)
130 |             return dataset_dict
131 | 
132 |         if "pan_seg_file_name" in dataset_dict:
133 |             pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB")
134 |             segments_info = dataset_dict["segments_info"]
135 | 
136 |             # apply the same transformation to panoptic segmentation
137 |             pan_seg_gt = transforms.apply_segmentation(pan_seg_gt)
138 | 
139 |             from panopticapi.utils import rgb2id
140 | 
141 |             pan_seg_gt = rgb2id(pan_seg_gt)
142 | 
143 |             instances = Instances(image_shape)
144 |             classes = []
145 |             masks = []
146 |             for segment_info in segments_info:
147 |                 class_id = segment_info["category_id"]
148 |                 if not segment_info["iscrowd"]:
149 |                     classes.append(class_id)
150 |                     masks.append(pan_seg_gt == segment_info["id"])
151 | 
152 |             classes = np.array(classes)
153 |             instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
154 |             if len(masks) == 0:
155 |                 # Some image does not have annotation (all ignored)
156 |                 instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
157 |                 instances.gt_boxes = Boxes(torch.zeros((0, 4)))
158 |             else:
159 |                 masks = BitMasks(
160 |                     torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
161 |                 )
162 |                 instances.gt_masks = masks.tensor
163 |                 instances.gt_boxes = masks.get_bounding_boxes()
164 | 
165 |             dataset_dict["instances"] = instances
166 | 
167 |         return dataset_dict
168 | 


--------------------------------------------------------------------------------
/maskclip/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | import copy
  3 | import logging
  4 | 
  5 | import numpy as np
  6 | import torch
  7 | from torch.nn import functional as F
  8 | 
  9 | from detectron2.config import configurable
 10 | from detectron2.data import detection_utils as utils
 11 | from detectron2.data import transforms as T
 12 | from detectron2.structures import BitMasks, Instances
 13 | 
 14 | from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper
 15 | 
 16 | __all__ = ["MaskFormerPanopticDatasetMapper"]
 17 | 
 18 | 
 19 | class MaskFormerPanopticDatasetMapper(MaskFormerSemanticDatasetMapper):
 20 |     """
 21 |     A callable which takes a dataset dict in Detectron2 Dataset format,
 22 |     and map it into a format used by MaskFormer for panoptic segmentation.
 23 | 
 24 |     The callable currently does the following:
 25 | 
 26 |     1. Read the image from "file_name"
 27 |     2. Applies geometric transforms to the image and annotation
 28 |     3. Find and applies suitable cropping to the image and annotation
 29 |     4. Prepare image and annotation to Tensors
 30 |     """
 31 | 
 32 |     @configurable
 33 |     def __init__(
 34 |         self,
 35 |         is_train=True,
 36 |         *,
 37 |         augmentations,
 38 |         image_format,
 39 |         ignore_label,
 40 |         size_divisibility,
 41 |     ):
 42 |         """
 43 |         NOTE: this interface is experimental.
 44 |         Args:
 45 |             is_train: for training or inference
 46 |             augmentations: a list of augmentations or deterministic transforms to apply
 47 |             image_format: an image format supported by :func:`detection_utils.read_image`.
 48 |             ignore_label: the label that is ignored to evaluation
 49 |             size_divisibility: pad image size to be divisible by this value
 50 |         """
 51 |         super().__init__(
 52 |             is_train,
 53 |             augmentations=augmentations,
 54 |             image_format=image_format,
 55 |             ignore_label=ignore_label,
 56 |             size_divisibility=size_divisibility,
 57 |         )
 58 | 
 59 |     def __call__(self, dataset_dict):
 60 |         """
 61 |         Args:
 62 |             dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
 63 | 
 64 |         Returns:
 65 |             dict: a format that builtin models in detectron2 accept
 66 |         """
 67 |         assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!"
 68 | 
 69 |         dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
 70 |         image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
 71 |         utils.check_image_size(dataset_dict, image)
 72 | 
 73 |         # semantic segmentation
 74 |         if "sem_seg_file_name" in dataset_dict:
 75 |             # PyTorch transformation not implemented for uint16, so converting it to double first
 76 |             sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double")
 77 |         else:
 78 |             sem_seg_gt = None
 79 | 
 80 |         # panoptic segmentation
 81 |         if "pan_seg_file_name" in dataset_dict:
 82 |             pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB")
 83 |             segments_info = dataset_dict["segments_info"]
 84 |         else:
 85 |             pan_seg_gt = None
 86 |             segments_info = None
 87 | 
 88 |         if pan_seg_gt is None:
 89 |             raise ValueError(
 90 |                 "Cannot find 'pan_seg_file_name' for panoptic segmentation dataset {}.".format(
 91 |                     dataset_dict["file_name"]
 92 |                 )
 93 |             )
 94 | 
 95 |         aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
 96 |         aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
 97 |         image = aug_input.image
 98 |         if sem_seg_gt is not None:
 99 |             sem_seg_gt = aug_input.sem_seg
100 | 
101 |         # apply the same transformation to panoptic segmentation
102 |         pan_seg_gt = transforms.apply_segmentation(pan_seg_gt)
103 | 
104 |         from panopticapi.utils import rgb2id
105 | 
106 |         pan_seg_gt = rgb2id(pan_seg_gt)
107 | 
108 |         # Pad image and segmentation label here!
109 |         image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
110 |         if sem_seg_gt is not None:
111 |             sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long"))
112 |         pan_seg_gt = torch.as_tensor(pan_seg_gt.astype("long"))
113 | 
114 |         if self.size_divisibility > 0:
115 |             image_size = (image.shape[-2], image.shape[-1])
116 |             padding_size = [
117 |                 0,
118 |                 self.size_divisibility - image_size[1],
119 |                 0,
120 |                 self.size_divisibility - image_size[0],
121 |             ]
122 |             image = F.pad(image, padding_size, value=128).contiguous()
123 |             if sem_seg_gt is not None:
124 |                 sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous()
125 |             pan_seg_gt = F.pad(
126 |                 pan_seg_gt, padding_size, value=0
127 |             ).contiguous()  # 0 is the VOID panoptic label
128 | 
129 |         image_shape = (image.shape[-2], image.shape[-1])  # h, w
130 | 
131 |         # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
132 |         # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
133 |         # Therefore it's important to use torch.Tensor.
134 |         dataset_dict["image"] = image
135 |         if sem_seg_gt is not None:
136 |             dataset_dict["sem_seg"] = sem_seg_gt.long()
137 | 
138 |         if "annotations" in dataset_dict:
139 |             raise ValueError("Pemantic segmentation dataset should not have 'annotations'.")
140 | 
141 |         # Prepare per-category binary masks
142 |         pan_seg_gt = pan_seg_gt.numpy()
143 |         instances = Instances(image_shape)
144 |         classes = []
145 |         masks = []
146 |         for segment_info in segments_info:
147 |             class_id = segment_info["category_id"]
148 |             if not segment_info["iscrowd"]:
149 |                 classes.append(class_id)
150 |                 masks.append(pan_seg_gt == segment_info["id"])
151 | 
152 |         classes = np.array(classes)
153 |         instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
154 |         if len(masks) == 0:
155 |             # Some image does not have annotation (all ignored)
156 |             instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
157 |         else:
158 |             masks = BitMasks(
159 |                 torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
160 |             )
161 |             instances.gt_masks = masks.tensor
162 | 
163 |         dataset_dict["instances"] = instances
164 | 
165 |         return dataset_dict
166 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/modules/ms_deform_attn.py:
--------------------------------------------------------------------------------
  1 | # ------------------------------------------------------------------------------------------------
  2 | # Deformable DETR
  3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
  4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
  5 | # ------------------------------------------------------------------------------------------------
  6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
  7 | # ------------------------------------------------------------------------------------------------
  8 | 
  9 | # Copyright (c) Facebook, Inc. and its affiliates.
 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
 11 | 
 12 | from __future__ import absolute_import
 13 | from __future__ import print_function
 14 | from __future__ import division
 15 | 
 16 | import warnings
 17 | import math
 18 | 
 19 | import torch
 20 | from torch import nn
 21 | import torch.nn.functional as F
 22 | from torch.nn.init import xavier_uniform_, constant_
 23 | 
 24 | from ..functions import MSDeformAttnFunction
 25 | from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch
 26 | 
 27 | 
 28 | def _is_power_of_2(n):
 29 |     if (not isinstance(n, int)) or (n < 0):
 30 |         raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
 31 |     return (n & (n-1) == 0) and n != 0
 32 | 
 33 | 
 34 | class MSDeformAttn(nn.Module):
 35 |     def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
 36 |         """
 37 |         Multi-Scale Deformable Attention Module
 38 |         :param d_model      hidden dimension
 39 |         :param n_levels     number of feature levels
 40 |         :param n_heads      number of attention heads
 41 |         :param n_points     number of sampling points per attention head per feature level
 42 |         """
 43 |         super().__init__()
 44 |         if d_model % n_heads != 0:
 45 |             raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
 46 |         _d_per_head = d_model // n_heads
 47 |         # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
 48 |         if not _is_power_of_2(_d_per_head):
 49 |             warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
 50 |                           "which is more efficient in our CUDA implementation.")
 51 | 
 52 |         self.im2col_step = 128
 53 | 
 54 |         self.d_model = d_model
 55 |         self.n_levels = n_levels
 56 |         self.n_heads = n_heads
 57 |         self.n_points = n_points
 58 | 
 59 |         self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
 60 |         self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
 61 |         self.value_proj = nn.Linear(d_model, d_model)
 62 |         self.output_proj = nn.Linear(d_model, d_model)
 63 | 
 64 |         self._reset_parameters()
 65 | 
 66 |     def _reset_parameters(self):
 67 |         constant_(self.sampling_offsets.weight.data, 0.)
 68 |         thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
 69 |         grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
 70 |         grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
 71 |         for i in range(self.n_points):
 72 |             grid_init[:, :, i, :] *= i + 1
 73 |         with torch.no_grad():
 74 |             self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
 75 |         constant_(self.attention_weights.weight.data, 0.)
 76 |         constant_(self.attention_weights.bias.data, 0.)
 77 |         xavier_uniform_(self.value_proj.weight.data)
 78 |         constant_(self.value_proj.bias.data, 0.)
 79 |         xavier_uniform_(self.output_proj.weight.data)
 80 |         constant_(self.output_proj.bias.data, 0.)
 81 | 
 82 |     def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
 83 |         """
 84 |         :param query                       (N, Length_{query}, C)
 85 |         :param reference_points            (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
 86 |                                         or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
 87 |         :param input_flatten               (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
 88 |         :param input_spatial_shapes        (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
 89 |         :param input_level_start_index     (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
 90 |         :param input_padding_mask          (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements
 91 | 
 92 |         :return output                     (N, Length_{query}, C)
 93 |         """
 94 |         N, Len_q, _ = query.shape
 95 |         N, Len_in, _ = input_flatten.shape
 96 |         assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in
 97 | 
 98 |         value = self.value_proj(input_flatten)
 99 |         if input_padding_mask is not None:
100 |             value = value.masked_fill(input_padding_mask[..., None], float(0))
101 |         value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
102 |         sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
103 |         attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
104 |         attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
105 |         # N, Len_q, n_heads, n_levels, n_points, 2
106 |         if reference_points.shape[-1] == 2:
107 |             offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
108 |             sampling_locations = reference_points[:, :, None, :, None, :] \
109 |                                  + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
110 |         elif reference_points.shape[-1] == 4:
111 |             sampling_locations = reference_points[:, :, None, :, None, :2] \
112 |                                  + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
113 |         else:
114 |             raise ValueError(
115 |                 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
116 |         try:
117 |             output = MSDeformAttnFunction.apply(
118 |                 value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
119 |         except:
120 |             # CPU
121 |             output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
122 |         # # For FLOPs calculation only
123 |         # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
124 |         output = self.output_proj(output)
125 |         return output
126 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/build/lib.linux-x86_64-3.9/modules/ms_deform_attn.py:
--------------------------------------------------------------------------------
  1 | # ------------------------------------------------------------------------------------------------
  2 | # Deformable DETR
  3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
  4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
  5 | # ------------------------------------------------------------------------------------------------
  6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
  7 | # ------------------------------------------------------------------------------------------------
  8 | 
  9 | # Copyright (c) Facebook, Inc. and its affiliates.
 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
 11 | 
 12 | from __future__ import absolute_import
 13 | from __future__ import print_function
 14 | from __future__ import division
 15 | 
 16 | import warnings
 17 | import math
 18 | 
 19 | import torch
 20 | from torch import nn
 21 | import torch.nn.functional as F
 22 | from torch.nn.init import xavier_uniform_, constant_
 23 | 
 24 | from ..functions import MSDeformAttnFunction
 25 | from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch
 26 | 
 27 | 
 28 | def _is_power_of_2(n):
 29 |     if (not isinstance(n, int)) or (n < 0):
 30 |         raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
 31 |     return (n & (n-1) == 0) and n != 0
 32 | 
 33 | 
 34 | class MSDeformAttn(nn.Module):
 35 |     def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
 36 |         """
 37 |         Multi-Scale Deformable Attention Module
 38 |         :param d_model      hidden dimension
 39 |         :param n_levels     number of feature levels
 40 |         :param n_heads      number of attention heads
 41 |         :param n_points     number of sampling points per attention head per feature level
 42 |         """
 43 |         super().__init__()
 44 |         if d_model % n_heads != 0:
 45 |             raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
 46 |         _d_per_head = d_model // n_heads
 47 |         # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
 48 |         if not _is_power_of_2(_d_per_head):
 49 |             warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
 50 |                           "which is more efficient in our CUDA implementation.")
 51 | 
 52 |         self.im2col_step = 128
 53 | 
 54 |         self.d_model = d_model
 55 |         self.n_levels = n_levels
 56 |         self.n_heads = n_heads
 57 |         self.n_points = n_points
 58 | 
 59 |         self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
 60 |         self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
 61 |         self.value_proj = nn.Linear(d_model, d_model)
 62 |         self.output_proj = nn.Linear(d_model, d_model)
 63 | 
 64 |         self._reset_parameters()
 65 | 
 66 |     def _reset_parameters(self):
 67 |         constant_(self.sampling_offsets.weight.data, 0.)
 68 |         thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
 69 |         grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
 70 |         grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
 71 |         for i in range(self.n_points):
 72 |             grid_init[:, :, i, :] *= i + 1
 73 |         with torch.no_grad():
 74 |             self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
 75 |         constant_(self.attention_weights.weight.data, 0.)
 76 |         constant_(self.attention_weights.bias.data, 0.)
 77 |         xavier_uniform_(self.value_proj.weight.data)
 78 |         constant_(self.value_proj.bias.data, 0.)
 79 |         xavier_uniform_(self.output_proj.weight.data)
 80 |         constant_(self.output_proj.bias.data, 0.)
 81 | 
 82 |     def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
 83 |         """
 84 |         :param query                       (N, Length_{query}, C)
 85 |         :param reference_points            (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
 86 |                                         or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
 87 |         :param input_flatten               (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
 88 |         :param input_spatial_shapes        (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
 89 |         :param input_level_start_index     (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
 90 |         :param input_padding_mask          (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements
 91 | 
 92 |         :return output                     (N, Length_{query}, C)
 93 |         """
 94 |         N, Len_q, _ = query.shape
 95 |         N, Len_in, _ = input_flatten.shape
 96 |         assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in
 97 | 
 98 |         value = self.value_proj(input_flatten)
 99 |         if input_padding_mask is not None:
100 |             value = value.masked_fill(input_padding_mask[..., None], float(0))
101 |         value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
102 |         sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
103 |         attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
104 |         attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
105 |         # N, Len_q, n_heads, n_levels, n_points, 2
106 |         if reference_points.shape[-1] == 2:
107 |             offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
108 |             sampling_locations = reference_points[:, :, None, :, None, :] \
109 |                                  + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
110 |         elif reference_points.shape[-1] == 4:
111 |             sampling_locations = reference_points[:, :, None, :, None, :2] \
112 |                                  + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
113 |         else:
114 |             raise ValueError(
115 |                 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
116 |         try:
117 |             output = MSDeformAttnFunction.apply(
118 |                 value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
119 |         except:
120 |             # CPU
121 |             output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
122 |         # # For FLOPs calculation only
123 |         # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
124 |         output = self.output_proj(output)
125 |         return output
126 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/build/lib.linux-x86_64-cpython-39/modules/ms_deform_attn.py:
--------------------------------------------------------------------------------
  1 | # ------------------------------------------------------------------------------------------------
  2 | # Deformable DETR
  3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
  4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
  5 | # ------------------------------------------------------------------------------------------------
  6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
  7 | # ------------------------------------------------------------------------------------------------
  8 | 
  9 | # Copyright (c) Facebook, Inc. and its affiliates.
 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
 11 | 
 12 | from __future__ import absolute_import
 13 | from __future__ import print_function
 14 | from __future__ import division
 15 | 
 16 | import warnings
 17 | import math
 18 | 
 19 | import torch
 20 | from torch import nn
 21 | import torch.nn.functional as F
 22 | from torch.nn.init import xavier_uniform_, constant_
 23 | 
 24 | from ..functions import MSDeformAttnFunction
 25 | from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch
 26 | 
 27 | 
 28 | def _is_power_of_2(n):
 29 |     if (not isinstance(n, int)) or (n < 0):
 30 |         raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
 31 |     return (n & (n-1) == 0) and n != 0
 32 | 
 33 | 
 34 | class MSDeformAttn(nn.Module):
 35 |     def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
 36 |         """
 37 |         Multi-Scale Deformable Attention Module
 38 |         :param d_model      hidden dimension
 39 |         :param n_levels     number of feature levels
 40 |         :param n_heads      number of attention heads
 41 |         :param n_points     number of sampling points per attention head per feature level
 42 |         """
 43 |         super().__init__()
 44 |         if d_model % n_heads != 0:
 45 |             raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
 46 |         _d_per_head = d_model // n_heads
 47 |         # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
 48 |         if not _is_power_of_2(_d_per_head):
 49 |             warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
 50 |                           "which is more efficient in our CUDA implementation.")
 51 | 
 52 |         self.im2col_step = 128
 53 | 
 54 |         self.d_model = d_model
 55 |         self.n_levels = n_levels
 56 |         self.n_heads = n_heads
 57 |         self.n_points = n_points
 58 | 
 59 |         self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
 60 |         self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
 61 |         self.value_proj = nn.Linear(d_model, d_model)
 62 |         self.output_proj = nn.Linear(d_model, d_model)
 63 | 
 64 |         self._reset_parameters()
 65 | 
 66 |     def _reset_parameters(self):
 67 |         constant_(self.sampling_offsets.weight.data, 0.)
 68 |         thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
 69 |         grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
 70 |         grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
 71 |         for i in range(self.n_points):
 72 |             grid_init[:, :, i, :] *= i + 1
 73 |         with torch.no_grad():
 74 |             self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
 75 |         constant_(self.attention_weights.weight.data, 0.)
 76 |         constant_(self.attention_weights.bias.data, 0.)
 77 |         xavier_uniform_(self.value_proj.weight.data)
 78 |         constant_(self.value_proj.bias.data, 0.)
 79 |         xavier_uniform_(self.output_proj.weight.data)
 80 |         constant_(self.output_proj.bias.data, 0.)
 81 | 
 82 |     def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
 83 |         """
 84 |         :param query                       (N, Length_{query}, C)
 85 |         :param reference_points            (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
 86 |                                         or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
 87 |         :param input_flatten               (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
 88 |         :param input_spatial_shapes        (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
 89 |         :param input_level_start_index     (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
 90 |         :param input_padding_mask          (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements
 91 | 
 92 |         :return output                     (N, Length_{query}, C)
 93 |         """
 94 |         N, Len_q, _ = query.shape
 95 |         N, Len_in, _ = input_flatten.shape
 96 |         assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in
 97 | 
 98 |         value = self.value_proj(input_flatten)
 99 |         if input_padding_mask is not None:
100 |             value = value.masked_fill(input_padding_mask[..., None], float(0))
101 |         value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
102 |         sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
103 |         attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
104 |         attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
105 |         # N, Len_q, n_heads, n_levels, n_points, 2
106 |         if reference_points.shape[-1] == 2:
107 |             offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
108 |             sampling_locations = reference_points[:, :, None, :, None, :] \
109 |                                  + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
110 |         elif reference_points.shape[-1] == 4:
111 |             sampling_locations = reference_points[:, :, None, :, None, :2] \
112 |                                  + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
113 |         else:
114 |             raise ValueError(
115 |                 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
116 |         try:
117 |             output = MSDeformAttnFunction.apply(
118 |                 value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
119 |         except:
120 |             # CPU
121 |             output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
122 |         # # For FLOPs calculation only
123 |         # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
124 |         output = self.output_proj(output)
125 |         return output
126 | 


--------------------------------------------------------------------------------
/maskclip/data/dataset_mappers/mask_former_instance_dataset_mapper.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | import copy
  3 | import logging
  4 | 
  5 | import numpy as np
  6 | import pycocotools.mask as mask_util
  7 | import torch
  8 | from torch.nn import functional as F
  9 | 
 10 | from detectron2.config import configurable
 11 | from detectron2.data import detection_utils as utils
 12 | from detectron2.data import transforms as T
 13 | from detectron2.projects.point_rend import ColorAugSSDTransform
 14 | from detectron2.structures import BitMasks, Instances, polygons_to_bitmask
 15 | 
 16 | __all__ = ["MaskFormerInstanceDatasetMapper"]
 17 | 
 18 | 
 19 | class MaskFormerInstanceDatasetMapper:
 20 |     """
 21 |     A callable which takes a dataset dict in Detectron2 Dataset format,
 22 |     and map it into a format used by MaskFormer for instance segmentation.
 23 | 
 24 |     The callable currently does the following:
 25 | 
 26 |     1. Read the image from "file_name"
 27 |     2. Applies geometric transforms to the image and annotation
 28 |     3. Find and applies suitable cropping to the image and annotation
 29 |     4. Prepare image and annotation to Tensors
 30 |     """
 31 | 
 32 |     @configurable
 33 |     def __init__(
 34 |         self,
 35 |         is_train=True,
 36 |         *,
 37 |         augmentations,
 38 |         image_format,
 39 |         size_divisibility,
 40 |     ):
 41 |         """
 42 |         NOTE: this interface is experimental.
 43 |         Args:
 44 |             is_train: for training or inference
 45 |             augmentations: a list of augmentations or deterministic transforms to apply
 46 |             image_format: an image format supported by :func:`detection_utils.read_image`.
 47 |             size_divisibility: pad image size to be divisible by this value
 48 |         """
 49 |         self.is_train = is_train
 50 |         self.tfm_gens = augmentations
 51 |         self.img_format = image_format
 52 |         self.size_divisibility = size_divisibility
 53 | 
 54 |         logger = logging.getLogger(__name__)
 55 |         mode = "training" if is_train else "inference"
 56 |         logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}")
 57 | 
 58 |     @classmethod
 59 |     def from_config(cls, cfg, is_train=True):
 60 | 
 61 |         # # Build augmentation
 62 |         # augs = [
 63 |         #     T.ResizeShortestEdge(
 64 |         #         cfg.INPUT.MIN_SIZE_TRAIN,
 65 |         #         cfg.INPUT.MAX_SIZE_TRAIN,
 66 |         #         cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING,
 67 |         #     )
 68 |         # ]
 69 |         # if cfg.INPUT.CROP.ENABLED:
 70 |         #     augs.append(
 71 |         #         T.RandomCrop(
 72 |         #             cfg.INPUT.CROP.TYPE,
 73 |         #             cfg.INPUT.CROP.SIZE,
 74 |         #         )
 75 |         #     )
 76 |         # if cfg.INPUT.COLOR_AUG_SSD:
 77 |         #     augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT))
 78 |         # augs.append(T.RandomFlip())
 79 | 
 80 |         augs = [
 81 |             T.Resize((1024, 1024))
 82 |         ]
 83 | 
 84 |         ret = {
 85 |             "is_train": is_train,
 86 |             "augmentations": augs,
 87 |             "image_format": cfg.INPUT.FORMAT,
 88 |             "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY,
 89 |         }
 90 |         return ret
 91 | 
 92 |     def __call__(self, dataset_dict):
 93 |         """
 94 |         Args:
 95 |             dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
 96 | 
 97 |         Returns:
 98 |             dict: a format that builtin models in detectron2 accept
 99 |         """
100 |         assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!"
101 | 
102 |         dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
103 |         image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
104 |         utils.check_image_size(dataset_dict, image)
105 | 
106 |         aug_input = T.AugInput(image)
107 |         aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
108 |         image = aug_input.image
109 | 
110 |         # transform instnace masks
111 |         assert "annotations" in dataset_dict
112 |         for anno in dataset_dict["annotations"]:
113 |             anno.pop("keypoints", None)
114 | 
115 |         annos = [
116 |             utils.transform_instance_annotations(obj, transforms, image.shape[:2])
117 |             for obj in dataset_dict.pop("annotations")
118 |             if obj.get("iscrowd", 0) == 0
119 |         ]
120 | 
121 |         if len(annos):
122 |             assert "segmentation" in annos[0]
123 |         segms = [obj["segmentation"] for obj in annos]
124 |         masks = []
125 |         for segm in segms:
126 |             if isinstance(segm, list):
127 |                 # polygon
128 |                 masks.append(polygons_to_bitmask(segm, *image.shape[:2]))
129 |             elif isinstance(segm, dict):
130 |                 # COCO RLE
131 |                 masks.append(mask_util.decode(segm))
132 |             elif isinstance(segm, np.ndarray):
133 |                 assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
134 |                     segm.ndim
135 |                 )
136 |                 # mask array
137 |                 masks.append(segm)
138 |             else:
139 |                 raise ValueError(
140 |                     "Cannot convert segmentation of type '{}' to BitMasks!"
141 |                     "Supported types are: polygons as list[list[float] or ndarray],"
142 |                     " COCO-style RLE as a dict, or a binary segmentation mask "
143 |                     " in a 2D numpy array of shape HxW.".format(type(segm))
144 |                 )
145 | 
146 |         # Pad image and segmentation label here!
147 |         image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
148 |         masks = [torch.from_numpy(np.ascontiguousarray(x)) for x in masks]
149 | 
150 |         classes = [int(obj["category_id"]) for obj in annos]
151 |         classes = torch.tensor(classes, dtype=torch.int64)
152 | 
153 |         if self.size_divisibility > 0:
154 |             image_size = (image.shape[-2], image.shape[-1])
155 |             padding_size = [
156 |                 0,
157 |                 self.size_divisibility - image_size[1],
158 |                 0,
159 |                 self.size_divisibility - image_size[0],
160 |             ]
161 |             # pad image
162 |             image = F.pad(image, padding_size, value=128).contiguous()
163 |             # pad mask
164 |             masks = [F.pad(x, padding_size, value=0).contiguous() for x in masks]
165 | 
166 |         image_shape = (image.shape[-2], image.shape[-1])  # h, w
167 | 
168 |         # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
169 |         # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
170 |         # Therefore it's important to use torch.Tensor.
171 |         dataset_dict["image"] = image
172 | 
173 |         # Prepare per-category binary masks
174 |         instances = Instances(image_shape)
175 |         instances.gt_classes = classes
176 |         if len(masks) == 0:
177 |             # Some image does not have annotation (all ignored)
178 |             instances.gt_masks = torch.zeros((0, image.shape[-2], image.shape[-1]))
179 |         else:
180 |             masks = BitMasks(torch.stack(masks))
181 |             instances.gt_masks = masks.tensor
182 | 
183 |         dataset_dict["instances"] = instances
184 | 
185 |         return dataset_dict
186 | 


--------------------------------------------------------------------------------
/maskclip/data/dataset_mappers/mask_former_semantic_dataset_mapper.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | import copy
  3 | import logging
  4 | 
  5 | import numpy as np
  6 | import torch
  7 | from torch.nn import functional as F
  8 | 
  9 | from detectron2.config import configurable
 10 | from detectron2.data import MetadataCatalog
 11 | from detectron2.data import detection_utils as utils
 12 | from detectron2.data import transforms as T
 13 | from detectron2.projects.point_rend import ColorAugSSDTransform
 14 | from detectron2.structures import BitMasks, Instances
 15 | 
 16 | __all__ = ["MaskFormerSemanticDatasetMapper"]
 17 | 
 18 | 
 19 | class MaskFormerSemanticDatasetMapper:
 20 |     """
 21 |     A callable which takes a dataset dict in Detectron2 Dataset format,
 22 |     and map it into a format used by MaskFormer for semantic segmentation.
 23 | 
 24 |     The callable currently does the following:
 25 | 
 26 |     1. Read the image from "file_name"
 27 |     2. Applies geometric transforms to the image and annotation
 28 |     3. Find and applies suitable cropping to the image and annotation
 29 |     4. Prepare image and annotation to Tensors
 30 |     """
 31 | 
 32 |     @configurable
 33 |     def __init__(
 34 |         self,
 35 |         is_train=True,
 36 |         *,
 37 |         augmentations,
 38 |         image_format,
 39 |         ignore_label,
 40 |         size_divisibility,
 41 |     ):
 42 |         """
 43 |         NOTE: this interface is experimental.
 44 |         Args:
 45 |             is_train: for training or inference
 46 |             augmentations: a list of augmentations or deterministic transforms to apply
 47 |             image_format: an image format supported by :func:`detection_utils.read_image`.
 48 |             ignore_label: the label that is ignored to evaluation
 49 |             size_divisibility: pad image size to be divisible by this value
 50 |         """
 51 |         self.is_train = is_train
 52 |         self.tfm_gens = augmentations
 53 |         self.img_format = image_format
 54 |         self.ignore_label = ignore_label
 55 |         self.size_divisibility = size_divisibility
 56 | 
 57 |         logger = logging.getLogger(__name__)
 58 |         mode = "training" if is_train else "inference"
 59 |         logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}")
 60 | 
 61 |     @classmethod
 62 |     def from_config(cls, cfg, is_train=True):
 63 | 
 64 |         # Build augmentation
 65 |         # augs = [
 66 |         #     T.ResizeShortestEdge(
 67 |         #         cfg.INPUT.MIN_SIZE_TRAIN,
 68 |         #         cfg.INPUT.MAX_SIZE_TRAIN,
 69 |         #         cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING,
 70 |         #     )
 71 |         # ]
 72 |         # if cfg.INPUT.CROP.ENABLED:
 73 |         #     augs.append(
 74 |         #         T.RandomCrop_CategoryAreaConstraint(
 75 |         #             cfg.INPUT.CROP.TYPE,
 76 |         #             cfg.INPUT.CROP.SIZE,
 77 |         #             cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA,
 78 |         #             cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
 79 |         #         )
 80 |         #     )
 81 |         # if cfg.INPUT.COLOR_AUG_SSD:
 82 |         #     augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT))
 83 |         # augs.append(T.RandomFlip())
 84 | 
 85 |         augs = [
 86 |             T.Resize((1024, 1024))
 87 |         ]
 88 | 
 89 |         # Assume always applies to the training set.
 90 |         dataset_names = cfg.DATASETS.TRAIN
 91 |         meta = MetadataCatalog.get(dataset_names[0])
 92 |         ignore_label = meta.ignore_label
 93 | 
 94 |         ret = {
 95 |             "is_train": is_train,
 96 |             "augmentations": augs,
 97 |             "image_format": cfg.INPUT.FORMAT,
 98 |             "ignore_label": ignore_label,
 99 |             "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY,
100 |         }
101 |         return ret
102 | 
103 |     def __call__(self, dataset_dict):
104 |         """
105 |         Args:
106 |             dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
107 | 
108 |         Returns:
109 |             dict: a format that builtin models in detectron2 accept
110 |         """
111 |         assert self.is_train, "MaskFormerSemanticDatasetMapper should only be used for training!"
112 | 
113 |         dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
114 |         image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
115 |         utils.check_image_size(dataset_dict, image)
116 | 
117 |         if "sem_seg_file_name" in dataset_dict:
118 |             # PyTorch transformation not implemented for uint16, so converting it to double first
119 |             sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double")
120 |         else:
121 |             sem_seg_gt = None
122 | 
123 |         if sem_seg_gt is None:
124 |             raise ValueError(
125 |                 "Cannot find 'sem_seg_file_name' for semantic segmentation dataset {}.".format(
126 |                     dataset_dict["file_name"]
127 |                 )
128 |             )
129 | 
130 |         aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
131 |         aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
132 |         image = aug_input.image
133 |         sem_seg_gt = aug_input.sem_seg
134 | 
135 |         # Pad image and segmentation label here!
136 |         image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
137 |         if sem_seg_gt is not None:
138 |             sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long"))
139 | 
140 |         if self.size_divisibility > 0:
141 |             image_size = (image.shape[-2], image.shape[-1])
142 |             padding_size = [
143 |                 0,
144 |                 self.size_divisibility - image_size[1],
145 |                 0,
146 |                 self.size_divisibility - image_size[0],
147 |             ]
148 |             image = F.pad(image, padding_size, value=128).contiguous()
149 |             if sem_seg_gt is not None:
150 |                 sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous()
151 | 
152 |         image_shape = (image.shape[-2], image.shape[-1])  # h, w
153 | 
154 |         # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
155 |         # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
156 |         # Therefore it's important to use torch.Tensor.
157 |         dataset_dict["image"] = image
158 | 
159 |         if sem_seg_gt is not None:
160 |             dataset_dict["sem_seg"] = sem_seg_gt.long()
161 | 
162 |         if "annotations" in dataset_dict:
163 |             raise ValueError("Semantic segmentation dataset should not have 'annotations'.")
164 | 
165 |         # Prepare per-category binary masks
166 |         if sem_seg_gt is not None:
167 |             sem_seg_gt = sem_seg_gt.numpy()
168 |             instances = Instances(image_shape)
169 |             classes = np.unique(sem_seg_gt)
170 |             # remove ignored region
171 |             classes = classes[classes != self.ignore_label]
172 |             instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
173 | 
174 |             masks = []
175 |             for class_id in classes:
176 |                 masks.append(sem_seg_gt == class_id)
177 | 
178 |             if len(masks) == 0:
179 |                 # Some image does not have annotation (all ignored)
180 |                 instances.gt_masks = torch.zeros((0, sem_seg_gt.shape[-2], sem_seg_gt.shape[-1]))
181 |             else:
182 |                 masks = BitMasks(
183 |                     torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
184 |                 )
185 |                 instances.gt_masks = masks.tensor
186 | 
187 |             dataset_dict["instances"] = instances
188 | 
189 |         return dataset_dict
190 | 


--------------------------------------------------------------------------------
/maskclip/modeling/transformer_decoder/maskformer_transformer_decoder.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py
  3 | import fvcore.nn.weight_init as weight_init
  4 | import torch
  5 | from torch import nn
  6 | from torch.nn import functional as F
  7 | 
  8 | from detectron2.config import configurable
  9 | from detectron2.layers import Conv2d
 10 | from detectron2.utils.registry import Registry
 11 | 
 12 | from .position_encoding import PositionEmbeddingSine
 13 | from .transformer import Transformer
 14 | 
 15 | 
# Registry of transformer decoder classes. Classes add themselves with
# `@TRANSFORMER_DECODER_REGISTRY.register()` and are looked up by name in
# `build_transformer_decoder`.
TRANSFORMER_DECODER_REGISTRY = Registry("TRANSFORMER_MODULE")
TRANSFORMER_DECODER_REGISTRY.__doc__ = """
Registry for transformer module in MaskFormer.
"""
 20 | 
 21 | 
 22 | def build_transformer_decoder(cfg, in_channels, mask_classification=True):
 23 |     """
 24 |     Build a instance embedding branch from `cfg.MODEL.INS_EMBED_HEAD.NAME`.
 25 |     """
 26 |     name = cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME
 27 |     return TRANSFORMER_DECODER_REGISTRY.get(name)(cfg, in_channels, mask_classification)
 28 | 
 29 | 
 30 | @TRANSFORMER_DECODER_REGISTRY.register()
 31 | class StandardTransformerDecoder(nn.Module):
 32 |     @configurable
 33 |     def __init__(
 34 |         self,
 35 |         in_channels,
 36 |         mask_classification=True,
 37 |         *,
 38 |         num_classes: int,
 39 |         hidden_dim: int,
 40 |         num_queries: int,
 41 |         nheads: int,
 42 |         dropout: float,
 43 |         dim_feedforward: int,
 44 |         enc_layers: int,
 45 |         dec_layers: int,
 46 |         pre_norm: bool,
 47 |         deep_supervision: bool,
 48 |         mask_dim: int,
 49 |         enforce_input_project: bool,
 50 |     ):
 51 |         """
 52 |         NOTE: this interface is experimental.
 53 |         Args:
 54 |             in_channels: channels of the input features
 55 |             mask_classification: whether to add mask classifier or not
 56 |             num_classes: number of classes
 57 |             hidden_dim: Transformer feature dimension
 58 |             num_queries: number of queries
 59 |             nheads: number of heads
 60 |             dropout: dropout in Transformer
 61 |             dim_feedforward: feature dimension in feedforward network
 62 |             enc_layers: number of Transformer encoder layers
 63 |             dec_layers: number of Transformer decoder layers
 64 |             pre_norm: whether to use pre-LayerNorm or not
 65 |             deep_supervision: whether to add supervision to every decoder layers
 66 |             mask_dim: mask feature dimension
 67 |             enforce_input_project: add input project 1x1 conv even if input
 68 |                 channels and hidden dim is identical
 69 |         """
 70 |         super().__init__()
 71 | 
 72 |         self.mask_classification = mask_classification
 73 | 
 74 |         # positional encoding
 75 |         N_steps = hidden_dim // 2
 76 |         self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True)
 77 | 
 78 |         transformer = Transformer(
 79 |             d_model=hidden_dim,
 80 |             dropout=dropout,
 81 |             nhead=nheads,
 82 |             dim_feedforward=dim_feedforward,
 83 |             num_encoder_layers=enc_layers,
 84 |             num_decoder_layers=dec_layers,
 85 |             normalize_before=pre_norm,
 86 |             return_intermediate_dec=deep_supervision,
 87 |         )
 88 | 
 89 |         self.num_queries = num_queries
 90 |         self.transformer = transformer
 91 |         hidden_dim = transformer.d_model
 92 | 
 93 |         self.query_embed = nn.Embedding(num_queries, hidden_dim)
 94 | 
 95 |         if in_channels != hidden_dim or enforce_input_project:
 96 |             self.input_proj = Conv2d(in_channels, hidden_dim, kernel_size=1)
 97 |             weight_init.c2_xavier_fill(self.input_proj)
 98 |         else:
 99 |             self.input_proj = nn.Sequential()
100 |         self.aux_loss = deep_supervision
101 | 
102 |         # output FFNs
103 |         if self.mask_classification:
104 |             self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
105 |         self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3)
106 | 
107 |     @classmethod
108 |     def from_config(cls, cfg, in_channels, mask_classification):
109 |         ret = {}
110 |         ret["in_channels"] = in_channels
111 |         ret["mask_classification"] = mask_classification
112 | 
113 |         ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES
114 |         ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM
115 |         ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES
116 |         # Transformer parameters:
117 |         ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS
118 |         ret["dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT
119 |         ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD
120 |         ret["enc_layers"] = cfg.MODEL.MASK_FORMER.ENC_LAYERS
121 |         ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS
122 |         ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM
123 |         ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION
124 |         ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ
125 | 
126 |         ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
127 | 
128 |         return ret
129 | 
130 |     def forward(self, x, mask_features, mask=None):
131 |         if mask is not None:
132 |             mask = F.interpolate(mask[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
133 |         pos = self.pe_layer(x, mask)
134 | 
135 |         src = x
136 |         hs, memory = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos)
137 | 
138 |         if self.mask_classification:
139 |             outputs_class = self.class_embed(hs)
140 |             out = {"pred_logits": outputs_class[-1]}
141 |         else:
142 |             out = {}
143 | 
144 |         if self.aux_loss:
145 |             # [l, bs, queries, embed]
146 |             mask_embed = self.mask_embed(hs)
147 |             outputs_seg_masks = torch.einsum("lbqc,bchw->lbqhw", mask_embed, mask_features)
148 |             out["pred_masks"] = outputs_seg_masks[-1]
149 |             out["aux_outputs"] = self._set_aux_loss(
150 |                 outputs_class if self.mask_classification else None, outputs_seg_masks
151 |             )
152 |         else:
153 |             # FIXME h_boxes takes the last one computed, keep this in mind
154 |             # [bs, queries, embed]
155 |             mask_embed = self.mask_embed(hs[-1])
156 |             outputs_seg_masks = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features)
157 |             out["pred_masks"] = outputs_seg_masks
158 |         return out
159 | 
160 |     @torch.jit.unused
161 |     def _set_aux_loss(self, outputs_class, outputs_seg_masks):
162 |         # this is a workaround to make torchscript happy, as torchscript
163 |         # doesn't support dictionary with non-homogeneous values, such
164 |         # as a dict having both a Tensor and a list.
165 |         if self.mask_classification:
166 |             return [
167 |                 {"pred_logits": a, "pred_masks": b}
168 |                 for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1])
169 |             ]
170 |         else:
171 |             return [{"pred_masks": b} for b in outputs_seg_masks[:-1]]
172 | 
173 | 
class MLP(nn.Module):
    """Very simple multi-layer perceptron (also called FFN)

    A stack of linear layers of widths
    ``input_dim -> hidden_dim -> ... -> hidden_dim -> output_dim``; ReLU follows
    every layer except the last.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        # Consecutive pairs of this list are the (in, out) sizes of each layer.
        widths = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
        self.layers = nn.ModuleList(
            [nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:])]
        )

    def forward(self, x):
        last_index = self.num_layers - 1
        for index, layer in enumerate(self.layers):
            x = layer(x)
            if index < last_index:
                x = F.relu(x)
        return x
189 | 


--------------------------------------------------------------------------------
/maskclip/data/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
  3 | import copy
  4 | import logging
  5 | 
  6 | import numpy as np
  7 | import torch
  8 | 
  9 | from detectron2.config import configurable
 10 | from detectron2.data import detection_utils as utils
 11 | from detectron2.data import transforms as T
 12 | from detectron2.data.transforms import TransformGen
 13 | from detectron2.structures import BitMasks, Instances
 14 | 
 15 | from pycocotools import mask as coco_mask
 16 | 
 17 | __all__ = ["COCOInstanceNewBaselineDatasetMapper"]
 18 | 
 19 | 
 20 | def convert_coco_poly_to_mask(segmentations, height, width):
 21 |     masks = []
 22 |     for polygons in segmentations:
 23 |         rles = coco_mask.frPyObjects(polygons, height, width)
 24 |         mask = coco_mask.decode(rles)
 25 |         if len(mask.shape) < 3:
 26 |             mask = mask[..., None]
 27 |         mask = torch.as_tensor(mask, dtype=torch.uint8)
 28 |         mask = mask.any(dim=2)
 29 |         masks.append(mask)
 30 |     if masks:
 31 |         masks = torch.stack(masks, dim=0)
 32 |     else:
 33 |         masks = torch.zeros((0, height, width), dtype=torch.uint8)
 34 |     return masks
 35 | 
 36 | 
 37 | def build_transform_gen(cfg, is_train):
 38 |     """
 39 |     Create a list of default :class:`Augmentation` from config.
 40 |     Now it includes resizing and flipping.
 41 |     Returns:
 42 |         list[Augmentation]
 43 |     """
 44 |     assert is_train, "Only support training augmentation"
 45 |     image_size = cfg.INPUT.IMAGE_SIZE
 46 |     min_scale = cfg.INPUT.MIN_SCALE
 47 |     max_scale = cfg.INPUT.MAX_SCALE
 48 | 
 49 |     augmentation = []
 50 | 
 51 |     if cfg.INPUT.RANDOM_FLIP != "none":
 52 |         augmentation.append(
 53 |             T.RandomFlip(
 54 |                 horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal",
 55 |                 vertical=cfg.INPUT.RANDOM_FLIP == "vertical",
 56 |             )
 57 |         )
 58 | 
 59 |     augmentation.extend([
 60 |         T.ResizeScale(
 61 |             min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size
 62 |         ),
 63 |         T.FixedSizeCrop(crop_size=(image_size, image_size)),
 64 |     ])
 65 | 
 66 |     return augmentation
 67 | 
 68 | 
 69 | # This is specifically designed for the COCO dataset.
 70 | class COCOInstanceNewBaselineDatasetMapper:
 71 |     """
 72 |     A callable which takes a dataset dict in Detectron2 Dataset format,
 73 |     and map it into a format used by MaskFormer.
 74 | 
 75 |     This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
 76 | 
 77 |     The callable currently does the following:
 78 | 
 79 |     1. Read the image from "file_name"
 80 |     2. Applies geometric transforms to the image and annotation
 81 |     3. Find and applies suitable cropping to the image and annotation
 82 |     4. Prepare image and annotation to Tensors
 83 |     """
 84 | 
 85 |     @configurable
 86 |     def __init__(
 87 |         self,
 88 |         is_train=True,
 89 |         *,
 90 |         tfm_gens,
 91 |         image_format,
 92 |     ):
 93 |         """
 94 |         NOTE: this interface is experimental.
 95 |         Args:
 96 |             is_train: for training or inference
 97 |             augmentations: a list of augmentations or deterministic transforms to apply
 98 |             tfm_gens: data augmentation
 99 |             image_format: an image format supported by :func:`detection_utils.read_image`.
100 |         """
101 |         self.tfm_gens = tfm_gens
102 |         logging.getLogger(__name__).info(
103 |             "[COCOInstanceNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(str(self.tfm_gens))
104 |         )
105 | 
106 |         self.img_format = image_format
107 |         self.is_train = is_train
108 |     
109 |     @classmethod
110 |     def from_config(cls, cfg, is_train=True):
111 |         # Build augmentation
112 |         tfm_gens = build_transform_gen(cfg, is_train)
113 | 
114 |         ret = {
115 |             "is_train": is_train,
116 |             "tfm_gens": tfm_gens,
117 |             "image_format": cfg.INPUT.FORMAT,
118 |         }
119 |         return ret
120 | 
121 |     def __call__(self, dataset_dict):
122 |         """
123 |         Args:
124 |             dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
125 | 
126 |         Returns:
127 |             dict: a format that builtin models in detectron2 accept
128 |         """
129 |         dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
130 |         image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
131 |         utils.check_image_size(dataset_dict, image)
132 | 
133 |         # TODO: get padding mask
134 |         # by feeding a "segmentation mask" to the same transforms
135 |         padding_mask = np.ones(image.shape[:2])
136 | 
137 |         image, transforms = T.apply_transform_gens(self.tfm_gens, image)
138 |         # the crop transformation has default padding value 0 for segmentation
139 |         padding_mask = transforms.apply_segmentation(padding_mask)
140 |         padding_mask = ~ padding_mask.astype(bool)
141 | 
142 |         image_shape = image.shape[:2]  # h, w
143 | 
144 |         # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
145 |         # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
146 |         # Therefore it's important to use torch.Tensor.
147 |         dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
148 |         dataset_dict["padding_mask"] = torch.as_tensor(np.ascontiguousarray(padding_mask))
149 | 
150 |         if not self.is_train:
151 |             # USER: Modify this if you want to keep them for some reason.
152 |             dataset_dict.pop("annotations", None)
153 |             return dataset_dict
154 | 
155 |         if "annotations" in dataset_dict:
156 |             # USER: Modify this if you want to keep them for some reason.
157 |             for anno in dataset_dict["annotations"]:
158 |                 # Let's always keep mask
159 |                 # if not self.mask_on:
160 |                 #     anno.pop("segmentation", None)
161 |                 anno.pop("keypoints", None)
162 | 
163 |             # USER: Implement additional transformations if you have other types of data
164 |             annos = [
165 |                 utils.transform_instance_annotations(obj, transforms, image_shape)
166 |                 for obj in dataset_dict.pop("annotations")
167 |                 if obj.get("iscrowd", 0) == 0
168 |             ]
169 |             # NOTE: does not support BitMask due to augmentation
170 |             # Current BitMask cannot handle empty objects
171 |             instances = utils.annotations_to_instances(annos, image_shape)
172 |             # After transforms such as cropping are applied, the bounding box may no longer
173 |             # tightly bound the object. As an example, imagine a triangle object
174 |             # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight
175 |             # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to
176 |             # the intersection of original bounding box and the cropping box.
177 |             instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
178 |             # Need to filter empty instances first (due to augmentation)
179 |             instances = utils.filter_empty_instances(instances)
180 |             # Generate masks from polygon
181 |             h, w = instances.image_size
182 |             # image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float)
183 |             if hasattr(instances, 'gt_masks'):
184 |                 gt_masks = instances.gt_masks
185 |                 gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w)
186 |                 instances.gt_masks = gt_masks
187 |             dataset_dict["instances"] = instances
188 | 
189 |         return dataset_dict
190 | 


--------------------------------------------------------------------------------
/maskclip/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu:
--------------------------------------------------------------------------------
  1 | /*!
  2 | **************************************************************************************************
  3 | * Deformable DETR
  4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
  5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
  6 | **************************************************************************************************
  7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
  8 | **************************************************************************************************
  9 | */
 10 | 
 11 | /*!
 12 | * Copyright (c) Facebook, Inc. and its affiliates.
 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
 14 | */
 15 | 
 16 | #include <vector>
 17 | #include "cuda/ms_deform_im2col_cuda.cuh"
 18 | 
 19 | #include <ATen/ATen.h>
 20 | #include <ATen/cuda/CUDAContext.h>
 21 | #include <cuda.h>
 22 | #include <cuda_runtime.h>
 23 | 
 24 | 
 25 | at::Tensor ms_deform_attn_cuda_forward(
 26 |     const at::Tensor &value, 
 27 |     const at::Tensor &spatial_shapes,
 28 |     const at::Tensor &level_start_index,
 29 |     const at::Tensor &sampling_loc,
 30 |     const at::Tensor &attn_weight,
 31 |     const int im2col_step)
 32 | {
 33 |     AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
 34 |     AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
 35 |     AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
 36 |     AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
 37 |     AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
 38 | 
 39 |     AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
 40 |     AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
 41 |     AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
 42 |     AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
 43 |     AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
 44 | 
 45 |     const int batch = value.size(0);
 46 |     const int spatial_size = value.size(1);
 47 |     const int num_heads = value.size(2);
 48 |     const int channels = value.size(3);
 49 | 
 50 |     const int num_levels = spatial_shapes.size(0);
 51 | 
 52 |     const int num_query = sampling_loc.size(1);
 53 |     const int num_point = sampling_loc.size(4);
 54 | 
 55 |     const int im2col_step_ = std::min(batch, im2col_step);
 56 | 
 57 |     AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
 58 |     
 59 |     auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
 60 | 
 61 |     const int batch_n = im2col_step_;
 62 |     auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
 63 |     auto per_value_size = spatial_size * num_heads * channels;
 64 |     auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
 65 |     auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
 66 |     for (int n = 0; n < batch/im2col_step_; ++n)
 67 |     {
 68 |         auto columns = output_n.select(0, n);
 69 |         AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] {
 70 |             ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
 71 |                 value.data<scalar_t>() + n * im2col_step_ * per_value_size,
 72 |                 spatial_shapes.data<int64_t>(),
 73 |                 level_start_index.data<int64_t>(),
 74 |                 sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
 75 |                 attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
 76 |                 batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
 77 |                 columns.data<scalar_t>());
 78 | 
 79 |         }));
 80 |     }
 81 | 
 82 |     output = output.view({batch, num_query, num_heads*channels});
 83 | 
 84 |     return output;
 85 | }
 86 | 
 87 | 
 88 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
 89 |     const at::Tensor &value, 
 90 |     const at::Tensor &spatial_shapes,
 91 |     const at::Tensor &level_start_index,
 92 |     const at::Tensor &sampling_loc,
 93 |     const at::Tensor &attn_weight,
 94 |     const at::Tensor &grad_output,
 95 |     const int im2col_step)
 96 | {
 97 | 
 98 |     AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
 99 |     AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
100 |     AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
101 |     AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
102 |     AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
103 |     AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
104 | 
105 |     AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
106 |     AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
107 |     AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
108 |     AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
109 |     AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
110 |     AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
111 | 
112 |     const int batch = value.size(0);
113 |     const int spatial_size = value.size(1);
114 |     const int num_heads = value.size(2);
115 |     const int channels = value.size(3);
116 | 
117 |     const int num_levels = spatial_shapes.size(0);
118 | 
119 |     const int num_query = sampling_loc.size(1);
120 |     const int num_point = sampling_loc.size(4);
121 | 
122 |     const int im2col_step_ = std::min(batch, im2col_step);
123 | 
124 |     AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
125 | 
126 |     auto grad_value = at::zeros_like(value);
127 |     auto grad_sampling_loc = at::zeros_like(sampling_loc);
128 |     auto grad_attn_weight = at::zeros_like(attn_weight);
129 | 
130 |     const int batch_n = im2col_step_;
131 |     auto per_value_size = spatial_size * num_heads * channels;
132 |     auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
133 |     auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
134 |     auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
135 |     
136 |     for (int n = 0; n < batch/im2col_step_; ++n)
137 |     {
138 |         auto grad_output_g = grad_output_n.select(0, n);
139 |         AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] {
140 |             ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
141 |                                     grad_output_g.data<scalar_t>(),
142 |                                     value.data<scalar_t>() + n * im2col_step_ * per_value_size,
143 |                                     spatial_shapes.data<int64_t>(),
144 |                                     level_start_index.data<int64_t>(),
145 |                                     sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
146 |                                     attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
147 |                                     batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
148 |                                     grad_value.data<scalar_t>() +  n * im2col_step_ * per_value_size,
149 |                                     grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
150 |                                     grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
151 | 
152 |         }));
153 |     }
154 | 
155 |     return {
156 |         grad_value, grad_sampling_loc, grad_attn_weight
157 |     };
158 | }


--------------------------------------------------------------------------------
/maskclip/data/datasets/register_coco_panoptic_annos_semseg.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | import json
  3 | import os
  4 | 
  5 | from detectron2.data import DatasetCatalog, MetadataCatalog
  6 | from detectron2.data.datasets import load_sem_seg
  7 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
  8 | from detectron2.utils.file_io import PathManager
  9 | 
 10 | 
 11 | _PREDEFINED_SPLITS_COCO_PANOPTIC = {
 12 |     "coco_2017_train_panoptic": (
 13 |         # This is the original panoptic annotation directory
 14 |         "coco/panoptic_train2017",
 15 |         "coco/annotations/panoptic_train2017.json",
 16 |         # This directory contains semantic annotations that are
 17 |         # converted from panoptic annotations.
 18 |         # It is used by PanopticFPN.
 19 |         # You can use the script at detectron2/datasets/prepare_panoptic_fpn.py
 20 |         # to create these directories.
 21 |         "coco/panoptic_semseg_train2017",
 22 |     ),
 23 |     "coco_2017_val_panoptic": (
 24 |         "coco/panoptic_val2017",
 25 |         "coco/annotations/panoptic_val2017.json",
 26 |         "coco/panoptic_semseg_val2017",
 27 |     ),
 28 | }
 29 | 
 30 | 
 31 | def get_metadata():
 32 |     meta = {}
 33 |     # The following metadata maps contiguous id from [0, #thing categories +
 34 |     # #stuff categories) to their names and colors. We have to replica of the
 35 |     # same name and color under "thing_*" and "stuff_*" because the current
 36 |     # visualization function in D2 handles thing and class classes differently
 37 |     # due to some heuristic used in Panoptic FPN. We keep the same naming to
 38 |     # enable reusing existing visualization functions.
 39 |     thing_classes = [k["name"] for k in COCO_CATEGORIES if k["isthing"] == 1]
 40 |     thing_colors = [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 1]
 41 |     stuff_classes = [k["name"] for k in COCO_CATEGORIES]
 42 |     stuff_colors = [k["color"] for k in COCO_CATEGORIES]
 43 | 
 44 |     meta["thing_classes"] = thing_classes
 45 |     meta["thing_colors"] = thing_colors
 46 |     meta["stuff_classes"] = stuff_classes
 47 |     meta["stuff_colors"] = stuff_colors
 48 | 
 49 |     # Convert category id for training:
 50 |     #   category id: like semantic segmentation, it is the class id for each
 51 |     #   pixel. Since there are some classes not used in evaluation, the category
 52 |     #   id is not always contiguous and thus we have two set of category ids:
 53 |     #       - original category id: category id in the original dataset, mainly
 54 |     #           used for evaluation.
 55 |     #       - contiguous category id: [0, #classes), in order to train the linear
 56 |     #           softmax classifier.
 57 |     thing_dataset_id_to_contiguous_id = {}
 58 |     stuff_dataset_id_to_contiguous_id = {}
 59 | 
 60 |     for i, cat in enumerate(COCO_CATEGORIES):
 61 |         if cat["isthing"]:
 62 |             thing_dataset_id_to_contiguous_id[cat["id"]] = i
 63 |         # else:
 64 |         #     stuff_dataset_id_to_contiguous_id[cat["id"]] = i
 65 | 
 66 |         # in order to use sem_seg evaluator
 67 |         stuff_dataset_id_to_contiguous_id[cat["id"]] = i
 68 | 
 69 |     meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id
 70 |     meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id
 71 | 
 72 |     return meta
 73 | 
 74 | 
 75 | def load_coco_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta):
 76 |     """
 77 |     Args:
 78 |         image_dir (str): path to the raw dataset. e.g., "~/coco/train2017".
 79 |         gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017".
 80 |         json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json".
 81 |     Returns:
 82 |         list[dict]: a list of dicts in Detectron2 standard format. (See
 83 |         `Using Custom Datasets </tutorials/datasets.html>`_ )
 84 |     """
 85 | 
 86 |     def _convert_category_id(segment_info, meta):
 87 |         if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]:
 88 |             segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][
 89 |                 segment_info["category_id"]
 90 |             ]
 91 |             segment_info["isthing"] = True
 92 |         else:
 93 |             segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][
 94 |                 segment_info["category_id"]
 95 |             ]
 96 |             segment_info["isthing"] = False
 97 |         return segment_info
 98 | 
 99 |     with PathManager.open(json_file) as f:
100 |         json_info = json.load(f)
101 | 
102 |     ret = []
103 |     for ann in json_info["annotations"]:
104 |         image_id = int(ann["image_id"])
105 |         # TODO: currently we assume image and label has the same filename but
106 |         # different extension, and images have extension ".jpg" for COCO. Need
107 |         # to make image extension a user-provided argument if we extend this
108 |         # function to support other COCO-like datasets.
109 |         image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg")
110 |         label_file = os.path.join(gt_dir, ann["file_name"])
111 |         sem_label_file = os.path.join(semseg_dir, ann["file_name"])
112 |         segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]]
113 |         ret.append(
114 |             {
115 |                 "file_name": image_file,
116 |                 "image_id": image_id,
117 |                 "pan_seg_file_name": label_file,
118 |                 "sem_seg_file_name": sem_label_file,
119 |                 "segments_info": segments_info,
120 |             }
121 |         )
122 |     assert len(ret), f"No images found in {image_dir}!"
123 |     assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"]
124 |     assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"]
125 |     assert PathManager.isfile(ret[0]["sem_seg_file_name"]), ret[0]["sem_seg_file_name"]
126 |     return ret
127 | 
128 | 
def register_coco_panoptic_annos_sem_seg(
    name, metadata, image_root, panoptic_root, panoptic_json, sem_seg_root, instances_json
):
    """
    Register ``name + "_with_sem_seg"`` and refresh the thing metadata of ``name``.

    Args:
        name (str): name of the already existing panoptic dataset entry.
        metadata (dict): extra metadata; must contain "thing_classes" and
            "thing_colors" and is also expanded into the new dataset's metadata.
        image_root (str): directory of the images.
        panoptic_root (str): directory of the panoptic annotation images.
        panoptic_json (str): path to the panoptic json file.
        sem_seg_root (str): directory of the semantic annotation images.
        instances_json (str): stored as ``json_file`` on the new dataset's metadata.
    """
    # Drop the existing thing_* attributes before setting the new ones.
    # NOTE(review): presumably needed because the catalog refuses to overwrite
    # an attribute with a different value — confirm against detectron2.
    panoptic_meta = MetadataCatalog.get(name)
    for stale_field in ("thing_classes", "thing_colors"):
        delattr(panoptic_meta, stale_field)
    panoptic_meta.set(
        thing_classes=metadata["thing_classes"],
        thing_colors=metadata["thing_colors"],
    )

    # the name is "coco_2017_train_panoptic_with_sem_seg" and "coco_2017_val_panoptic_with_sem_seg"
    semantic_name = name + "_with_sem_seg"

    def _load_split():
        return load_coco_panoptic_json(
            panoptic_json, image_root, panoptic_root, sem_seg_root, metadata
        )

    DatasetCatalog.register(semantic_name, _load_split)

    semantic_meta = MetadataCatalog.get(semantic_name)
    semantic_meta.set(
        sem_seg_root=sem_seg_root,
        panoptic_root=panoptic_root,
        image_root=image_root,
        panoptic_json=panoptic_json,
        json_file=instances_json,
        evaluator_type="coco_panoptic_seg",
        ignore_label=255,
        label_divisor=1000,
        **metadata,
    )
158 | 
159 | 
def register_all_coco_panoptic_annos_sem_seg(root):
    """
    Register every split listed in ``_PREDEFINED_SPLITS_COCO_PANOPTIC``.

    Args:
        root (str): dataset root that the relative panoptic / semantic paths of
            each split are joined onto.
    """
    suffix_length = len("_panoptic")
    for prefix, split_paths in _PREDEFINED_SPLITS_COCO_PANOPTIC.items():
        panoptic_root, panoptic_json, semantic_root = split_paths

        # The instance dataset shares the name minus the "_panoptic" suffix;
        # its image root and json path are reused here.
        instances_meta = MetadataCatalog.get(prefix[:-suffix_length])
        image_root = instances_meta.image_root
        instances_json = instances_meta.json_file

        register_coco_panoptic_annos_sem_seg(
            prefix,
            get_metadata(),
            image_root,
            os.path.join(root, panoptic_root),
            os.path.join(root, panoptic_json),
            os.path.join(root, semantic_root),
            instances_json,
        )
178 | 
179 | 
# Register all splits at import time, rooted at $DETECTRON2_DATASETS
# (falls back to the relative directory "datasets").
_root = os.environ.get("DETECTRON2_DATASETS", "datasets")
register_all_coco_panoptic_annos_sem_seg(_root)
182 | 


--------------------------------------------------------------------------------
/maskclip/modeling/matcher.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/matcher.py
  3 | """
  4 | Modules to compute the matching cost and solve the corresponding LSAP.
  5 | """
  6 | import torch
  7 | import torch.nn.functional as F
  8 | from scipy.optimize import linear_sum_assignment
  9 | from torch import nn
 10 | from torch.cuda.amp import autocast
 11 | 
 12 | from detectron2.projects.point_rend.point_features import point_sample
 13 | 
 14 | 
 15 | def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor):
 16 |     """
 17 |     Compute the DICE loss, similar to generalized IOU for masks
 18 |     Args:
 19 |         inputs: A float tensor of arbitrary shape.
 20 |                 The predictions for each example.
 21 |         targets: A float tensor with the same shape as inputs. Stores the binary
 22 |                  classification label for each element in inputs
 23 |                 (0 for the negative class and 1 for the positive class).
 24 |     """
 25 |     inputs = inputs.sigmoid()
 26 |     inputs = inputs.flatten(1)
 27 |     numerator = 2 * torch.einsum("nc,mc->nm", inputs, targets)
 28 |     denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :]
 29 |     loss = 1 - (numerator + 1) / (denominator + 1)
 30 |     return loss
 31 | 
 32 | 
 33 | batch_dice_loss_jit = torch.jit.script(
 34 |     batch_dice_loss
 35 | )  # type: torch.jit.ScriptModule
 36 | 
 37 | 
 38 | def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor):
 39 |     """
 40 |     Args:
 41 |         inputs: A float tensor of arbitrary shape.
 42 |                 The predictions for each example.
 43 |         targets: A float tensor with the same shape as inputs. Stores the binary
 44 |                  classification label for each element in inputs
 45 |                 (0 for the negative class and 1 for the positive class).
 46 |     Returns:
 47 |         Loss tensor
 48 |     """
 49 |     hw = inputs.shape[1]
 50 | 
 51 |     pos = F.binary_cross_entropy_with_logits(
 52 |         inputs, torch.ones_like(inputs), reduction="none"
 53 |     )
 54 |     neg = F.binary_cross_entropy_with_logits(
 55 |         inputs, torch.zeros_like(inputs), reduction="none"
 56 |     )
 57 | 
 58 |     loss = torch.einsum("nc,mc->nm", pos, targets) + torch.einsum(
 59 |         "nc,mc->nm", neg, (1 - targets)
 60 |     )
 61 | 
 62 |     return loss / hw
 63 | 
 64 | 
 65 | batch_sigmoid_ce_loss_jit = torch.jit.script(
 66 |     batch_sigmoid_ce_loss
 67 | )  # type: torch.jit.ScriptModule
 68 | 
 69 | 
 70 | class HungarianMatcher(nn.Module):
 71 |     """This class computes an assignment between the targets and the predictions of the network
 72 | 
 73 |     For efficiency reasons, the targets don't include the no_object. Because of this, in general,
 74 |     there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
 75 |     while the others are un-matched (and thus treated as non-objects).
 76 |     """
 77 | 
 78 |     def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_dice: float = 1, num_points: int = 0):
 79 |         """Creates the matcher
 80 | 
 81 |         Params:
 82 |             cost_class: This is the relative weight of the classification error in the matching cost
 83 |             cost_mask: This is the relative weight of the focal loss of the binary mask in the matching cost
 84 |             cost_dice: This is the relative weight of the dice loss of the binary mask in the matching cost
 85 |         """
 86 |         super().__init__()
 87 |         self.cost_class = cost_class
 88 |         self.cost_mask = cost_mask
 89 |         self.cost_dice = cost_dice
 90 | 
 91 |         assert cost_class != 0 or cost_mask != 0 or cost_dice != 0, "all costs cant be 0"
 92 | 
 93 |         self.num_points = num_points
 94 | 
 95 |     @torch.no_grad()
 96 |     def memory_efficient_forward(self, outputs, targets):
 97 |         """More memory-friendly matching"""
 98 |         bs, num_queries = outputs["pred_logits"].shape[:2]
 99 | 
100 |         indices = []
101 | 
102 |         # Iterate through batch size
103 |         for b in range(bs):
104 | 
105 |             out_prob = outputs["pred_logits"][b].softmax(-1)  # [num_queries, num_classes]
106 |             tgt_ids = targets[b]["labels"]
107 | 
108 |             # Compute the classification cost. Contrary to the loss, we don't use the NLL,
109 |             # but approximate it in 1 - proba[target class].
110 |             # The 1 is a constant that doesn't change the matching, it can be ommitted.
111 |             cost_class = -out_prob[:, tgt_ids]
112 | 
113 |             out_mask = outputs["pred_masks"][b]  # [num_queries, H_pred, W_pred]
114 |             # gt masks are already padded when preparing target
115 |             tgt_mask = targets[b]["masks"].to(out_mask)
116 | 
117 |             out_mask = out_mask[:, None]
118 |             tgt_mask = tgt_mask[:, None]
119 |             # all masks share the same set of points for efficient matching!
120 |             point_coords = torch.rand(1, self.num_points, 2, device=out_mask.device)
121 |             # get gt labels
122 |             tgt_mask = point_sample(
123 |                 tgt_mask,
124 |                 point_coords.repeat(tgt_mask.shape[0], 1, 1),
125 |                 align_corners=False,
126 |             ).squeeze(1)
127 | 
128 |             out_mask = point_sample(
129 |                 out_mask,
130 |                 point_coords.repeat(out_mask.shape[0], 1, 1),
131 |                 align_corners=False,
132 |             ).squeeze(1)
133 | 
134 |             with autocast(enabled=False):
135 |                 out_mask = out_mask.float()
136 |                 tgt_mask = tgt_mask.float()
137 |                 # Compute the focal loss between masks
138 |                 cost_mask = batch_sigmoid_ce_loss_jit(out_mask, tgt_mask)
139 | 
140 |                 # Compute the dice loss betwen masks
141 |                 cost_dice = batch_dice_loss_jit(out_mask, tgt_mask)
142 |             
143 |             # Final cost matrix
144 |             C = (
145 |                 self.cost_mask * cost_mask
146 |                 + self.cost_class * cost_class
147 |                 + self.cost_dice * cost_dice
148 |             )
149 |             C = C.reshape(num_queries, -1).cpu()
150 | 
151 |             indices.append(linear_sum_assignment(C))
152 | 
153 |         return [
154 |             (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64))
155 |             for i, j in indices
156 |         ]
157 | 
158 |     @torch.no_grad()
159 |     def forward(self, outputs, targets):
160 |         """Performs the matching
161 | 
162 |         Params:
163 |             outputs: This is a dict that contains at least these entries:
164 |                  "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
165 |                  "pred_masks": Tensor of dim [batch_size, num_queries, H_pred, W_pred] with the predicted masks
166 | 
167 |             targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
168 |                  "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
169 |                            objects in the target) containing the class labels
170 |                  "masks": Tensor of dim [num_target_boxes, H_gt, W_gt] containing the target masks
171 | 
172 |         Returns:
173 |             A list of size batch_size, containing tuples of (index_i, index_j) where:
174 |                 - index_i is the indices of the selected predictions (in order)
175 |                 - index_j is the indices of the corresponding selected targets (in order)
176 |             For each batch element, it holds:
177 |                 len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
178 |         """
179 |         return self.memory_efficient_forward(outputs, targets)
180 | 
181 |     def __repr__(self, _repr_indent=4):
182 |         head = "Matcher " + self.__class__.__name__
183 |         body = [
184 |             "cost_class: {}".format(self.cost_class),
185 |             "cost_mask: {}".format(self.cost_mask),
186 |             "cost_dice: {}".format(self.cost_dice),
187 |         ]
188 |         lines = [head] + [" " * _repr_indent + line for line in body]
189 |         return "\n".join(lines)
190 | 


--------------------------------------------------------------------------------
/maskclip/modeling/utils.py:
--------------------------------------------------------------------------------
  1 | # copy from pytorch
  2 | import math
  3 | from typing import Optional, List, Tuple
  4 | 
  5 | import einops
  6 | import torch
  7 | import torch.nn.functional as F
  8 | from torch import Tensor
  9 | from torch._C import _add_docstr
 10 | 
 11 | 
 12 | 
# Module-level alias for torch's native linear op (``torch._C._nn.linear``);
# the in-projection helpers in this module call it directly.
linear = torch._C._nn.linear
 14 | 
 15 | 
 16 | def _in_projection_packed(
 17 |     q: Tensor,
 18 |     k: Tensor,
 19 |     v: Tensor,
 20 |     w: Tensor,
 21 |     b: Optional[Tensor] = None,
 22 | ) -> List[Tensor]:
 23 |     r"""
 24 |     Performs the in-projection step of the attention operation, using packed weights.
 25 |     Output is a triple containing projection tensors for query, key and value.
 26 | 
 27 |     Args:
 28 |         q, k, v: query, key and value tensors to be projected. For self-attention,
 29 |             these are typically the same tensor; for encoder-decoder attention,
 30 |             k and v are typically the same tensor. (We take advantage of these
 31 |             identities for performance if they are present.) Regardless, q, k and v
 32 |             must share a common embedding dimension; otherwise their shapes may vary.
 33 |         w: projection weights for q, k and v, packed into a single tensor. Weights
 34 |             are packed along dimension 0, in q, k, v order.
 35 |         b: optional projection biases for q, k and v, packed into a single tensor
 36 |             in q, k, v order.
 37 | 
 38 |     Shape:
 39 |         Inputs:
 40 |         - q: :math:`(..., E)` where E is the embedding dimension
 41 |         - k: :math:`(..., E)` where E is the embedding dimension
 42 |         - v: :math:`(..., E)` where E is the embedding dimension
 43 |         - w: :math:`(E * 3, E)` where E is the embedding dimension
 44 |         - b: :math:`E * 3` where E is the embedding dimension
 45 | 
 46 |         Output:
 47 |         - in output list :math:`[q', k', v']`, each output tensor will have the
 48 |             same shape as the corresponding input tensor.
 49 |     """
 50 |     E = q.size(-1)
 51 |     if k is v:
 52 |         if q is k:
 53 |             # self-attention
 54 |             return linear(q, w, b).chunk(3, dim=-1)
 55 |         else:
 56 |             # encoder-decoder attention
 57 |             w_q, w_kv = w.split([E, E * 2])
 58 |             if b is None:
 59 |                 b_q = b_kv = None
 60 |             else:
 61 |                 b_q, b_kv = b.split([E, E * 2])
 62 |             return (linear(q, w_q, b_q),) + linear(k, w_kv, b_kv).chunk(2, dim=-1)
 63 |     else:
 64 |         w_q, w_k, w_v = w.chunk(3)
 65 |         if b is None:
 66 |             b_q = b_k = b_v = None
 67 |         else:
 68 |             b_q, b_k, b_v = b.chunk(3)
 69 |         return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)
 70 | 
 71 | 
 72 | def _in_projection(
 73 |     q: Tensor,
 74 |     k: Tensor,
 75 |     v: Tensor,
 76 |     w_q: Tensor,
 77 |     w_k: Tensor,
 78 |     w_v: Tensor,
 79 |     b_q: Optional[Tensor] = None,
 80 |     b_k: Optional[Tensor] = None,
 81 |     b_v: Optional[Tensor] = None,
 82 | ) -> Tuple[Tensor, Tensor, Tensor]:
 83 |     r"""
 84 |     Performs the in-projection step of the attention operation. This is simply
 85 |     a triple of linear projections, with shape constraints on the weights which
 86 |     ensure embedding dimension uniformity in the projected outputs.
 87 |     Output is a triple containing projection tensors for query, key and value.
 88 | 
 89 |     Args:
 90 |         q, k, v: query, key and value tensors to be projected.
 91 |         w_q, w_k, w_v: weights for q, k and v, respectively.
 92 |         b_q, b_k, b_v: optional biases for q, k and v, respectively.
 93 | 
 94 |     Shape:
 95 |         Inputs:
 96 |         - q: :math:`(Qdims..., Eq)` where Eq is the query embedding dimension and Qdims are any
 97 |             number of leading dimensions.
 98 |         - k: :math:`(Kdims..., Ek)` where Ek is the key embedding dimension and Kdims are any
 99 |             number of leading dimensions.
100 |         - v: :math:`(Vdims..., Ev)` where Ev is the value embedding dimension and Vdims are any
101 |             number of leading dimensions.
102 |         - w_q: :math:`(Eq, Eq)`
103 |         - w_k: :math:`(Eq, Ek)`
104 |         - w_v: :math:`(Eq, Ev)`
105 |         - b_q: :math:`(Eq)`
106 |         - b_k: :math:`(Eq)`
107 |         - b_v: :math:`(Eq)`
108 | 
109 |         Output: in output triple :math:`(q', k', v')`,
110 |          - q': :math:`[Qdims..., Eq]`
111 |          - k': :math:`[Kdims..., Eq]`
112 |          - v': :math:`[Vdims..., Eq]`
113 | 
114 |     """
115 |     Eq, Ek, Ev = q.size(-1), k.size(-1), v.size(-1)
116 |     assert w_q.shape == (Eq, Eq), f"expecting query weights shape of {(Eq, Eq)}, but got {w_q.shape}"
117 |     assert w_k.shape == (Eq, Ek), f"expecting key weights shape of {(Eq, Ek)}, but got {w_k.shape}"
118 |     assert w_v.shape == (Eq, Ev), f"expecting value weights shape of {(Eq, Ev)}, but got {w_v.shape}"
119 |     assert b_q is None or b_q.shape == (Eq,), f"expecting query bias shape of {(Eq,)}, but got {b_q.shape}"
120 |     assert b_k is None or b_k.shape == (Eq,), f"expecting key bias shape of {(Eq,)}, but got {b_k.shape}"
121 |     assert b_v is None or b_v.shape == (Eq,), f"expecting value bias shape of {(Eq,)}, but got {b_v.shape}"
122 |     return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)
123 | 
124 | 
def _scaled_dot_product_attention(
    q: Tensor,
    k: Tensor,
    v: Tensor,
    attn_mask: Optional[Tensor] = None,
    dropout_p: float = 0.0,
    q_masks: Optional[Tensor] = None,
    k_masks: Optional[Tensor] = None,
    num_heads: Optional[int] = None,
) -> Tuple[Tensor, Tensor, Optional[Tensor]]:
    r"""
    Computes scaled dot product attention on query, key and value tensors, using
    an optional attention mask if passed, and applying dropout if a probability
    greater than 0.0 is specified.

    On top of the stock PyTorch routine this variant can blend a second,
    mask-embedding based score into part of the attention logits and returns
    an extra per-(query, key) feature product (see ``q_masks``).

    Args:
        q, k, v: query, key and value tensors. See Shape section for shape details.
        attn_mask: optional mask. A floating-point mask is added to the attention
            logits; a boolean mask marks positions to exclude (``True`` entries
            are filled with ``-inf`` before the softmax). May be 2D or 3D.
        dropout_p: dropout probability. If greater than 0.0, dropout is applied.
            NOTE: ``F.dropout`` is called with its default ``training=True``, so
            callers must pass 0.0 when dropout is not wanted (e.g. in eval mode).
        q_masks: optional extra query-side embedding. When given, with
            ``n = q_masks.shape[1]``, the score ``(q_masks / sqrt(E) * k_masks[:, None]).sum(-1)``
            is averaged with the regular logits in the slice ``attn[:, :n, n + 1:]``.
            Expected shapes are not checked here; the original comment suggests
            something like ``B x 100 x 576 x 1024`` for the product — TODO confirm
            against the caller.
        k_masks: key-side counterpart of ``q_masks``; required when ``q_masks`` is given.
        num_heads: number of attention heads folded into the batch axis of ``q``/``k``;
            required when ``q_masks`` is given (used to merge heads back together).

    Shape:
        - q: :math:`(B, Nt, E)` where B is batch size, Nt is the target sequence length,
            and E is embedding dimension.
        - key: :math:`(B, Ns, E)` where B is batch size, Ns is the source sequence length,
            and E is embedding dimension.
        - value: :math:`(B, Ns, E)` where B is batch size, Ns is the source sequence length,
            and E is embedding dimension.
        - attn_mask: either a 3D tensor of shape :math:`(B, Nt, Ns)` or a 2D tensor of
            shape :math:`(Nt, Ns)`.

        - Output: a triple ``(output, attn, mask_res)``. Attention values have shape
            :math:`(B, Nt, E)`; attention weights have shape :math:`(B, Nt, Ns)`.
            ``mask_res`` is ``None`` unless ``q_masks`` is given, in which case it is the
            elementwise product of the head-merged, scaled queries ``[:, :n]`` with the
            head-merged keys ``[:, n + 1:]``, i.e. shape :math:`(B / h, n, Ns - n - 1, h \cdot E)`.

    Raises:
        ValueError: if ``q_masks`` is given without ``k_masks`` or ``num_heads``.
    """
    # Unpacking also enforces that q is 3-D.
    _, _, E = q.shape

    q = q / math.sqrt(E)

    # (B, Nt, E) x (B, E, Ns) -> (B, Nt, Ns)
    attn = torch.bmm(q, k.transpose(-2, -1))

    mask_res = None
    if q_masks is not None:
        if k_masks is None or num_heads is None:
            raise ValueError("k_masks and num_heads are required when q_masks is given")
        len_q = q_masks.shape[1]
        # Merge the heads back into the channel axis: (b*h, t, e) -> (b, t, h*e).
        q = einops.rearrange(q, '(b h) t e -> b t (h e)', h=num_heads)
        k = einops.rearrange(k, '(b h) t e -> b t (h e)', h=num_heads)
        # Pairwise feature product between the first len_q query tokens and the
        # key tokens after position len_q (position len_q itself is skipped).
        mask_res = q[:, :len_q, None, :] * k[:, None, len_q + 1:, :]
        q_masks = q_masks / math.sqrt(E)
        attn2 = (q_masks * k_masks[:, None, :, :]).sum(-1)
        n = attn2.shape[1]
        # Average the regular logits with the mask-embedding logits in that slice.
        attn[:, :n, n + 1:] += attn2
        attn[:, :n, n + 1:] /= 2

    if attn_mask is not None:
        if attn_mask.dtype == torch.bool:
            # Adding a bool tensor would merely shift logits by 0/1; treat True
            # as "do not attend" instead.
            attn = attn.masked_fill(attn_mask, float("-inf"))
        else:
            attn += attn_mask
    attn = F.softmax(attn, dim=-1)
    if dropout_p > 0.0:
        attn = F.dropout(attn, p=dropout_p)
    # (B, Nt, Ns) x (B, Ns, E) -> (B, Nt, E)
    output = torch.bmm(attn, v)
    return output, attn, mask_res
193 | 


--------------------------------------------------------------------------------
/maskclip/modeling/maskclip.py:
--------------------------------------------------------------------------------
  1 | from collections import OrderedDict
  2 | from typing import Tuple, Union
  3 | 
  4 | import numpy as np
  5 | import torch
  6 | import torch.nn as nn
  7 | import torch.nn.functional as F
  8 | import clip
  9 | 
 10 | from .attention import MultiheadAttention
 11 | 
 12 | 
 13 | def gelu(x):
 14 |     return x * torch.sigmoid(1.702 * x)
 15 | 
 16 | 
 17 | class LayerNorm(nn.LayerNorm):
 18 |     """Subclass torch's LayerNorm to handle fp16."""
 19 | 
 20 |     def forward(self, x: torch.Tensor):
 21 |         orig_type = x.dtype
 22 |         ret = super().forward(x.type(torch.float32))
 23 |         return ret.type(orig_type)
 24 | 
 25 | 
 26 | class QuickGELU(nn.Module):
 27 |     def forward(self, x: torch.Tensor):
 28 |         return gelu(x)
 29 | 
 30 | 
 31 | class ResidualAttentionBlock(nn.Module):
 32 |     def __init__(self, d_model: int, n_head: int, clip_patch_size: int, need_masks_embed: bool):
 33 |         super().__init__()
 34 | 
 35 |         self.n_head = n_head
 36 |         self.clip_patch_size = clip_patch_size
 37 |         self.attn = MultiheadAttention(d_model, n_head, need_masks_embed=need_masks_embed)
 38 |         self.ln_1 = LayerNorm(d_model)
 39 |         self.mlp = nn.Sequential(OrderedDict([
 40 |             ("c_fc", nn.Linear(d_model, d_model * 4)),
 41 |             ("gelu", QuickGELU()),
 42 |             ("c_proj", nn.Linear(d_model * 4, d_model))
 43 |         ]))
 44 |         self.ln_2 = LayerNorm(d_model)
 45 | 
 46 |     def attention(self, x: torch.Tensor, attn_mask, masks_embed):
 47 |         x, _, masks_res = self.attn(x, x, x, need_weights=False, attn_mask=attn_mask, masks_embed=masks_embed)
 48 |         return x, masks_res
 49 | 
 50 |     def forward(self, x: torch.Tensor, masks: torch.Tensor, masks_embed: torch.Tensor = None):
 51 |         l, b, d = x.shape
 52 |         _, q, _, _ = masks.shape
 53 |         masks = (masks.sigmoid() >= 0.5).float()
 54 |         masks = F.max_pool2d(masks, self.clip_patch_size).flatten(2)
 55 | 
 56 |         attn_mask = torch.empty((b, l, l), device=x.device, dtype=torch.bool)
 57 |         attn_mask[:, :, :] = False
 58 |         attn_mask[:, :, :q] = True
 59 |         attn_mask[:, :q, q+1:] = masks == 0.
 60 |         
 61 |         attn_mask = torch.repeat_interleave(attn_mask, self.n_head, dim=0)
 62 | 
 63 |         x_res, masks_res = self.attention(self.ln_1(x), attn_mask=attn_mask, masks_embed=masks_embed)
 64 |         x = x + x_res
 65 |         x = x + self.mlp(self.ln_2(x))
 66 |         return x, masks_res
 67 | 
 68 | 
 69 | class Transformer(nn.Module):
 70 |     def __init__(
 71 |             self, width: int, layers: int, heads: int,
 72 |             clip_input_resolution, clip_patch_size, clip_width, clip_layers, clip_heads
 73 |         ):
 74 |         super().__init__()
 75 |         self.width = width
 76 |         self.layers = layers
 77 |         self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, clip_patch_size, (i + 1) % 6 == 0) 
 78 |             for i in range(layers)]
 79 |         )
 80 | 
 81 |         self.clip_input_resolution = clip_input_resolution
 82 |         self.clip_patch_size = clip_patch_size
 83 |         self.clip_width = clip_width
 84 |         self.clip_layers = clip_layers
 85 |         self.clip_heads = clip_heads
 86 |         self.clip_num_patches_dim = clip_input_resolution // clip_patch_size
 87 |         self.clip_num_patches = self.clip_num_patches_dim ** 2
 88 | 
 89 |         self.idxs = [5, 11, 17, 23]
 90 |         self.conv1_added_params = nn.Sequential(
 91 |             *[nn.Conv2d(1, clip_width, clip_patch_size, clip_patch_size, bias=True) 
 92 |                 for i in range(len(self.idxs))]
 93 |         )
 94 |         self.conv3_added_params = nn.Sequential(
 95 |             *[nn.Conv2d(clip_width, clip_patch_size ** 2, 1, 1, bias=True) 
 96 |                 for i in range(len(self.idxs))]
 97 |         )
 98 |         self.apply(self.init_weights)
 99 | 
100 |     def init_weights(self, m):
101 |         if isinstance(m, nn.Conv2d):
102 |             nn.init.kaiming_normal_(m.weight)
103 |             nn.init.uniform_(m.bias)
104 | 
105 |     def forward(self, x: torch.Tensor, masks: torch.Tensor):
106 | 
107 |         masks_list = []
108 |         for i, block in enumerate(list(self.resblocks.modules())[0]):
109 | 
110 |             if i in self.idxs:
111 |                 masks_embed = self.conv1_added_params[i//6](masks.tanh().unsqueeze(2).flatten(0, 1)) #(Bxq)x1xhxw
112 |                 masks_embed = masks_embed.reshape(
113 |                     x.shape[1], masks.shape[1], self.clip_width, self.clip_num_patches
114 |                 ).permute(1, 3, 0, 2)
115 |                 x, masks_res = block(x, masks, masks_embed)  # masks_res: Bx100x256x1024
116 | 
117 |                 masks_res = masks_res.permute(0, 1, 3, 2).reshape(
118 |                     x.shape[1], 100, self.clip_width, self.clip_num_patches_dim, self.clip_num_patches_dim
119 |                 ).flatten(0, 1)
120 |                 masks_res = gelu(masks_res)
121 |                 masks_res = self.conv3_added_params[i//6](masks_res)
122 |                 masks_res = masks_res.reshape(
123 |                     -1, self.clip_patch_size, self.clip_patch_size, self.clip_num_patches_dim, self.clip_num_patches_dim
124 |                 ).permute(0,3,1,4,2)
125 |                 masks_res = masks_res.flatten(1, 2).flatten(2, 3)
126 |                 masks_res = masks_res.squeeze(1).reshape(
127 |                     x.shape[1], 100, self.clip_input_resolution, self.clip_input_resolution
128 |                 )
129 | 
130 |                 masks += masks_res
131 |                 masks_list.append(masks)
132 |             else:
133 |                 x, _ = block(x, masks)
134 | 
135 |         return x, masks_list
136 | 
137 | 
class VisionTransformer(nn.Module):
    """Patch-embedding vision transformer that carries one extra class-style token per mask query.

    The image is patchified by ``conv1``, prefixed with a class token, and the
    (position-encoded, normalised) class token is then replicated once per
    query and prepended again before the sequence enters the ``Transformer``.
    Only the query tokens are normalised, projected and returned.
    """

    def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int):
        super().__init__()
        self.input_resolution = input_resolution
        self.output_dim = output_dim
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)

        scale = width ** -0.5
        num_positions = (input_resolution // patch_size) ** 2 + 1  # patches + class token
        self.class_embedding = nn.Parameter(scale * torch.randn(width))
        self.positional_embedding = nn.Parameter(scale * torch.randn(num_positions, width))
        self.ln_pre = LayerNorm(width)

        self.transformer = Transformer(width, layers, heads, input_resolution, patch_size, width, layers, heads)

        self.ln_post = LayerNorm(width)
        self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

    def forward(self, x: torch.Tensor, masks: torch.Tensor):
        """Encode an image batch under the given masks.

        Args:
            x: image batch fed to ``conv1`` (3 input channels).
            masks: mask tensor; ``masks.shape[1]`` is the number of queries.

        Returns:
            ``(features, masks_list)``: the projected query tokens, shape
            ``[batch, num_queries, output_dim]``, and the mask list produced by
            the transformer.
        """
        num_queries = masks.shape[1]

        patches = self.conv1(x)  # shape = [*, width, grid, grid]
        patches = patches.flatten(2).transpose(1, 2)  # shape = [*, grid ** 2, width]

        batch, _, channels = patches.shape
        cls_token = self.class_embedding.to(patches.dtype) + torch.zeros(
            batch, 1, channels, dtype=patches.dtype, device=patches.device
        )
        tokens = torch.cat([cls_token, patches], dim=1)  # shape = [*, grid ** 2 + 1, width]
        tokens = tokens + self.positional_embedding.to(tokens.dtype)
        tokens = self.ln_pre(tokens)

        tokens = tokens.permute(1, 0, 2)  # NLD -> LND
        # One copy of the class token per query, placed in front of the sequence.
        query_tokens = tokens[0:1].repeat(num_queries, 1, 1)
        tokens = torch.cat([query_tokens, tokens], dim=0)

        tokens, masks_list = self.transformer(tokens, masks)
        tokens = tokens.permute(1, 0, 2)  # LND -> NLD

        features = self.ln_post(tokens[:, :num_queries, :])

        if self.proj is not None:
            features = features @ self.proj

        return features, masks_list
179 | 
180 | 
class MaskCLIP(nn.Module):
    """Mask-conditioned CLIP visual encoder that classifies each mask query against text embeddings.

    The visual tower is initialised from the named CLIP checkpoint with
    ``strict=False``, so parameters that do not exist in CLIP keep their fresh
    initialisation.
    """

    def __init__(
            self, 
            # initialize CLIP
            clip_model_name,
            input_resolution,
            patch_size,
            width,
            layers,
            heads,
            output_dim,
            temperature
        ):
        super().__init__()

        self.temperature = temperature

        self.visual = VisionTransformer(
            input_resolution=input_resolution,
            patch_size=patch_size,
            width=width,
            layers=layers,
            heads=heads,
            output_dim=output_dim,
        )

        # Load CLIP on CPU only to copy its visual weights, then drop it.
        pretrained, _ = clip.load(clip_model_name, device='cpu')
        self.visual.load_state_dict(pretrained.visual.state_dict(), strict=False)

        del pretrained

    def forward(self, x, masks, txt_embed):
        """Return class logits and refined masks.

        Args:
            x: image batch.
            masks: mask tensor passed to the visual tower.
            txt_embed: text embeddings indexed ``(n, c)``; moved to the image
                features' device. They are used as given (no normalisation here).

        Returns:
            dict with ``'pred_logits'`` (cosine-style similarity divided by the
            temperature), ``'pred_masks'`` (last entry of the mask list) and
            ``'aux_outputs'`` (one dict per earlier mask-list entry, each sharing
            the same logits).
        """
        img_fet, masks_list = self.visual(x, masks)

        unit_img_fet = img_fet / img_fet.norm(dim=-1, keepdim=True)
        text = txt_embed.to(img_fet.device)
        logits = torch.einsum('b q c, n c -> b q n', unit_img_fet, text) / self.temperature

        return {
            'pred_logits': logits,
            'pred_masks': masks_list[-1],
            'aux_outputs': [
                {'pred_logits': logits, 'pred_masks': stage_masks}
                for stage_masks in masks_list[:-1]
            ],
        }
226 | 
227 | 


--------------------------------------------------------------------------------
/maskclip/modeling/meta_arch/per_pixel_baseline.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | import logging
  3 | from typing import Callable, Dict, List, Optional, Tuple, Union
  4 | 
  5 | import fvcore.nn.weight_init as weight_init
  6 | from torch import nn
  7 | from torch.nn import functional as F
  8 | 
  9 | from detectron2.config import configurable
 10 | from detectron2.layers import Conv2d, ShapeSpec, get_norm
 11 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
 12 | 
 13 | from ..transformer_decoder.maskformer_transformer_decoder import StandardTransformerDecoder
 14 | from ..pixel_decoder.fpn import build_pixel_decoder
 15 | 
 16 | 
 17 | @SEM_SEG_HEADS_REGISTRY.register()
 18 | class PerPixelBaselineHead(nn.Module):
 19 | 
 20 |     _version = 2
 21 | 
 22 |     def _load_from_state_dict(
 23 |         self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
 24 |     ):
 25 |         version = local_metadata.get("version", None)
 26 |         if version is None or version < 2:
 27 |             logger = logging.getLogger(__name__)
 28 |             # Do not warn if train from scratch
 29 |             scratch = True
 30 |             logger = logging.getLogger(__name__)
 31 |             for k in list(state_dict.keys()):
 32 |                 newk = k
 33 |                 if "sem_seg_head" in k and not k.startswith(prefix + "predictor"):
 34 |                     newk = k.replace(prefix, prefix + "pixel_decoder.")
 35 |                     # logger.warning(f"{k} ==> {newk}")
 36 |                 if newk != k:
 37 |                     state_dict[newk] = state_dict[k]
 38 |                     del state_dict[k]
 39 |                     scratch = False
 40 | 
 41 |             if not scratch:
 42 |                 logger.warning(
 43 |                     f"Weight format of {self.__class__.__name__} have changed! "
 44 |                     "Please upgrade your models. Applying automatic conversion now ..."
 45 |                 )
 46 | 
 47 |     @configurable
 48 |     def __init__(
 49 |         self,
 50 |         input_shape: Dict[str, ShapeSpec],
 51 |         *,
 52 |         num_classes: int,
 53 |         pixel_decoder: nn.Module,
 54 |         loss_weight: float = 1.0,
 55 |         ignore_value: int = -1,
 56 |     ):
 57 |         """
 58 |         NOTE: this interface is experimental.
 59 |         Args:
 60 |             input_shape: shapes (channels and stride) of the input features
 61 |             num_classes: number of classes to predict
 62 |             pixel_decoder: the pixel decoder module
 63 |             loss_weight: loss weight
 64 |             ignore_value: category id to be ignored during training.
 65 |         """
 66 |         super().__init__()
 67 |         input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
 68 |         self.in_features = [k for k, v in input_shape]
 69 |         feature_strides = [v.stride for k, v in input_shape]
 70 |         feature_channels = [v.channels for k, v in input_shape]
 71 | 
 72 |         self.ignore_value = ignore_value
 73 |         self.common_stride = 4
 74 |         self.loss_weight = loss_weight
 75 | 
 76 |         self.pixel_decoder = pixel_decoder
 77 |         self.predictor = Conv2d(
 78 |             self.pixel_decoder.mask_dim, num_classes, kernel_size=1, stride=1, padding=0
 79 |         )
 80 |         weight_init.c2_msra_fill(self.predictor)
 81 | 
 82 |     @classmethod
 83 |     def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
 84 |         return {
 85 |             "input_shape": {
 86 |                 k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
 87 |             },
 88 |             "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
 89 |             "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
 90 |             "pixel_decoder": build_pixel_decoder(cfg, input_shape),
 91 |             "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT,
 92 |         }
 93 | 
 94 |     def forward(self, features, targets=None):
 95 |         """
 96 |         Returns:
 97 |             In training, returns (None, dict of losses)
 98 |             In inference, returns (CxHxW logits, {})
 99 |         """
100 |         x = self.layers(features)
101 |         if self.training:
102 |             return None, self.losses(x, targets)
103 |         else:
104 |             x = F.interpolate(
105 |                 x, scale_factor=self.common_stride, mode="bilinear", align_corners=False
106 |             )
107 |             return x, {}
108 | 
109 |     def layers(self, features):
110 |         x, _, _ = self.pixel_decoder.forward_features(features)
111 |         x = self.predictor(x)
112 |         return x
113 | 
114 |     def losses(self, predictions, targets):
115 |         predictions = predictions.float()  # https://github.com/pytorch/pytorch/issues/48163
116 |         predictions = F.interpolate(
117 |             predictions, scale_factor=self.common_stride, mode="bilinear", align_corners=False
118 |         )
119 |         loss = F.cross_entropy(
120 |             predictions, targets, reduction="mean", ignore_index=self.ignore_value
121 |         )
122 |         losses = {"loss_sem_seg": loss * self.loss_weight}
123 |         return losses
124 | 
125 | 
@SEM_SEG_HEADS_REGISTRY.register()
class PerPixelBaselinePlusHead(PerPixelBaselineHead):
    """Per-pixel baseline whose predictor is a transformer decoder instead of a 1x1 conv.

    With ``deep_supervision`` enabled, the decoder's auxiliary outputs each
    contribute an extra ``loss_sem_seg_<i>`` term during training.
    """

    def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        """Rename keys of pre-version-2 checkpoints in place, warning if anything was renamed."""
        version = local_metadata.get("version", None)
        if version is not None and version >= 2:
            return

        logger = logging.getLogger(__name__)
        # Stays True (and suppresses the warning) when training from scratch.
        scratch = True
        predictor_prefix = prefix + "predictor"
        for old_key in list(state_dict.keys()):
            if "sem_seg_head" not in old_key or old_key.startswith(predictor_prefix):
                continue
            new_key = old_key.replace(prefix, prefix + "pixel_decoder.")
            logger.debug(f"{old_key} ==> {new_key}")
            if new_key != old_key:
                state_dict[new_key] = state_dict[old_key]
                del state_dict[old_key]
                scratch = False

        if not scratch:
            logger.warning(
                f"Weight format of {self.__class__.__name__} have changed! "
                "Please upgrade your models. Applying automatic conversion now ..."
            )

    @configurable
    def __init__(
        self,
        input_shape: Dict[str, ShapeSpec],
        *,
        # extra parameters
        transformer_predictor: nn.Module,
        transformer_in_feature: str,
        deep_supervision: bool,
        # inherit parameters
        num_classes: int,
        pixel_decoder: nn.Module,
        loss_weight: float = 1.0,
        ignore_value: int = -1,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            input_shape: shapes (channels and stride) of the input features
            transformer_predictor: the transformer decoder that makes prediction
            transformer_in_feature: input feature name to the transformer_predictor
            deep_supervision: whether or not to add supervision to the output of
                every transformer decoder layer
            num_classes: number of classes to predict
            pixel_decoder: the pixel decoder module
            loss_weight: loss weight
            ignore_value: category id to be ignored during training.
        """
        super().__init__(
            input_shape,
            num_classes=num_classes,
            pixel_decoder=pixel_decoder,
            loss_weight=loss_weight,
            ignore_value=ignore_value,
        )

        # Swap the conv predictor created by the parent for the transformer decoder.
        del self.predictor
        self.predictor = transformer_predictor

        self.transformer_in_feature = transformer_in_feature
        self.deep_supervision = deep_supervision

    @classmethod
    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
        """Extend the parent's kwargs with the transformer decoder settings from ``cfg.MODEL.MASK_FORMER``."""
        ret = super().from_config(cfg, input_shape)
        in_feature = cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE
        ret["transformer_in_feature"] = in_feature
        if in_feature == "transformer_encoder":
            in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
        else:
            in_channels = input_shape[in_feature].channels
        ret["transformer_predictor"] = StandardTransformerDecoder(
            cfg, in_channels, mask_classification=False
        )
        ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION
        return ret

    def forward(self, features, targets=None):
        """
        Returns:
            In training, returns (None, dict of losses)
            In inference, returns (CxHxW logits, {})
        """
        x, aux_outputs = self.layers(features)

        if not self.training:
            upsampled = F.interpolate(
                x, scale_factor=self.common_stride, mode="bilinear", align_corners=False
            )
            return upsampled, {}

        losses = self.losses(x, targets)
        if self.deep_supervision:
            # One additional loss term per auxiliary decoder output.
            for i, aux_output in enumerate(aux_outputs):
                aux_loss = self.losses(aux_output["pred_masks"], targets)
                losses["loss_sem_seg" + f"_{i}"] = aux_loss["loss_sem_seg"]
        return None, losses

    def layers(self, features):
        """Run pixel decoder and transformer predictor; returns ``(pred_masks, aux_outputs or None)``."""
        mask_features, transformer_encoder_features, _ = self.pixel_decoder.forward_features(features)
        if self.transformer_in_feature == "transformer_encoder":
            assert (
                transformer_encoder_features is not None
            ), "Please use the TransformerEncoderPixelDecoder."
            decoder_input = transformer_encoder_features
        else:
            decoder_input = features[self.transformer_in_feature]
        predictions = self.predictor(decoder_input, mask_features)

        pred_masks = predictions["pred_masks"]
        aux_outputs = predictions["aux_outputs"] if self.deep_supervision else None
        return pred_masks, aux_outputs
244 | 


--------------------------------------------------------------------------------