├── ovformer ├── evaluation │ ├── __init__.py │ └── instance_evaluation.py ├── utils │ ├── __init__.py │ └── misc.py ├── modeling │ ├── backbone │ │ └── __init__.py │ ├── meta_arch │ │ ├── __init__.py │ │ └── mask_former_head.py │ ├── pixel_decoder │ │ ├── __init__.py │ │ └── ops │ │ │ ├── make.sh │ │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn.py │ │ │ ├── functions │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn_func.py │ │ │ ├── src │ │ │ ├── vision.cpp │ │ │ ├── cuda │ │ │ │ └── ms_deform_attn_cuda.h │ │ │ ├── cpu │ │ │ │ ├── ms_deform_attn_cpu.h │ │ │ │ └── ms_deform_attn_cpu.cpp │ │ │ └── ms_deform_attn.h │ │ │ ├── setup.py │ │ │ └── test.py │ ├── transformer_decoder │ │ ├── __init__.py │ │ ├── position_encoding.py │ │ ├── zero_shot_classifier.py │ │ └── maskformer_transformer_decoder.py │ ├── __init__.py │ └── util.py ├── data │ ├── dataset_mappers │ │ ├── __init__.py │ │ ├── coco_panoptic_new_baseline_dataset_mapper.py │ │ ├── mask_former_panoptic_dataset_mapper.py │ │ ├── mask_former_instance_dataset_mapper.py │ │ └── mask_former_semantic_dataset_mapper.py │ ├── __init__.py │ └── datasets │ │ ├── __init__.py │ │ ├── register_ade20k_instance.py │ │ └── lvis_v1.py ├── data_video │ ├── datasets │ │ ├── ytvis_api │ │ │ └── __init__.py │ │ ├── __init__.py │ │ ├── builtin.py │ │ └── burst.py │ ├── __init__.py │ └── augmentation.py ├── __init__.py ├── test_time_augmentation.py └── config.py ├── OVFormer.png ├── ovformer_video ├── utils │ ├── __init__.py │ └── memory.py ├── modeling │ ├── __init__.py │ └── transformer_decoder │ │ ├── __init__.py │ │ ├── position_encoding.py │ │ └── zero_shot_classifier.py ├── __init__.py └── config.py ├── requirements.txt ├── evaluate ├── __init__.py └── mask.py ├── configs ├── lvis │ ├── ovformer_SwinB_bs8.yaml │ ├── Base-COCO-InstanceSegmentation.yaml │ └── ovformer_R50_bs8.yaml ├── ovis │ ├── ovformer_SwinB_bs8.yaml │ └── ovformer_R50_bs8.yaml ├── burst │ ├── ovformer_SwinB_bs8.yaml │ └── ovformer_R50_bs8.yaml ├── youtubevis_2019 │ ├── ovformer_SwinB_bs8.yaml │ └── ovformer_R50_bs8.yaml ├── youtubevis_2021 │ ├── ovformer_SwinB_bs8.yaml │ └── ovformer_R50_bs8.yaml └── lvvis │ ├── video_ovformer_SwinB_bs8.yaml │ ├── Base-LVVIS-VideoInstanceSegmentation.yaml │ ├── ovformer_R50_bs8.yaml │ └── video_ovformer_R50_bs8.yaml ├── .gitignore ├── tools ├── remove_lvvis_novel.py ├── remove_lvis_rare.py ├── lvivs_test_instances_json.py ├── convert-thirdparty-pretrained-model-to-d2.py ├── ytvis_json.py ├── get_lvvis_cat_info.py ├── get_lvis_cat_info.py ├── vis_results.py ├── save_clip_features.py ├── analyze_model.py └── burst2ytvis.py ├── INSTALL.md ├── mAP.py ├── README.md └── datasets └── README.md /ovformer/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /OVFormer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fanghaook/OVFormer/HEAD/OVFormer.png -------------------------------------------------------------------------------- /ovformer/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /ovformer_video/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. 
and its affiliates. 2 | -------------------------------------------------------------------------------- /ovformer/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /ovformer/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /ovformer/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /ovformer/modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | opencv-python==4.8.0.74 2 | cython 3 | scipy 4 | shapely 5 | timm==0.5.4 6 | h5py 7 | submitit 8 | scikit-image -------------------------------------------------------------------------------- /ovformer/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import datasets 3 | from .catalog import DatasetCatalog, MetadataCatalog 4 | -------------------------------------------------------------------------------- /ovformer/data_video/datasets/ytvis_api/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi 3 | -------------------------------------------------------------------------------- /evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | from .burst import BURST 2 | from .bursteval import BURSTeval 3 | from .lvvis import LVVIS 4 | from .lvviseval import LVVISeval 5 | from .mask import encode ,decode,area,toBbox -------------------------------------------------------------------------------- /ovformer_video/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .transformer_decoder.video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder 3 | -------------------------------------------------------------------------------- /ovformer_video/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder 3 | -------------------------------------------------------------------------------- /ovformer/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | from .maskformer_transformer_decoder import StandardTransformerDecoder 3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder 4 | -------------------------------------------------------------------------------- /ovformer_video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import modeling 3 | 4 | # config 5 | from .config import add_ovformer_video_config 6 | 7 | # models 8 | from .video_ovformer_model import VideoOVFormer 9 | 10 | -------------------------------------------------------------------------------- /ovformer/data_video/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from . import builtin # ensure the builtin datasets are registered 5 | 6 | __all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")] 7 | -------------------------------------------------------------------------------- /ovformer/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import ( 3 | register_ade20k_full, 4 | register_ade20k_panoptic, 5 | register_coco_stuff_10k, 6 | register_mapillary_vistas, 7 | register_coco_panoptic_annos_semseg, 8 | register_ade20k_instance, 9 | register_mapillary_vistas_panoptic, 10 | lvis_v1, 11 | lvvis_oracle, 12 | ) 13 | -------------------------------------------------------------------------------- /ovformer/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .backbone.swin import D2SwinTransformer 3 | from .backbone.timm import build_timm_backbone 4 | from .pixel_decoder.fpn import BasePixelDecoder 5 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder 6 | from .meta_arch.mask_former_head import MaskFormerHead 7 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead 8 | -------------------------------------------------------------------------------- /ovformer_video/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | from detectron2.config import CfgNode as CN 4 | 5 | 6 | def add_ovformer_video_config(cfg): 7 | # video data 8 | # DataLoader 9 | cfg.INPUT.SAMPLING_FRAME_NUM = 2 10 | cfg.INPUT.SAMPLING_FRAME_RANGE = 20 11 | cfg.INPUT.SAMPLING_FRAME_SHUFFLE = False 12 | cfg.INPUT.AUGMENTATIONS = [] # "brightness", "contrast", "saturation", "rotation" 13 | -------------------------------------------------------------------------------- /ovformer/data_video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from .dataset_mapper import YTVISDatasetMapper, CocoClipDatasetMapper 5 | from .build import * 6 | 7 | from .datasets import * 8 | from .ytvis_eval import YTVISEvaluator 9 | from .ovis_eval import OVISEvaluator 10 | from .lvvis_eval import LVVISEvaluator 11 | from .lvvis_eval_video import LVVISEvaluator_video 12 | from .burst_eval import BURSTEvaluator 13 | -------------------------------------------------------------------------------- /configs/lvis/ovformer_SwinB_bs8.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ovformer_R50_bs8.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "models/swin_base_patch4_window12_384_22k.pkl" 15 | SEM_SEG_HEAD: 16 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 17 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 18 | 19 | -------------------------------------------------------------------------------- /configs/ovis/ovformer_SwinB_bs8.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ovformer_R50_bs8.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "models/swin_base_patch4_window12_384_22k.pkl" 15 | SEM_SEG_HEAD: 16 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 17 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 18 | 19 | INPUT: 20 | MIN_SIZE_TEST: 480 21 | 22 | -------------------------------------------------------------------------------- /configs/burst/ovformer_SwinB_bs8.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ovformer_R50_bs8.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "models/swin_base_patch4_window12_384_22k.pkl" 15 | SEM_SEG_HEAD: 16 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 17 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 18 | 19 | INPUT: 20 | MIN_SIZE_TEST: 480 21 | 22 | -------------------------------------------------------------------------------- /configs/youtubevis_2019/ovformer_SwinB_bs8.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ovformer_R50_bs8.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "models/swin_base_patch4_window12_384_22k.pkl" 15 | SEM_SEG_HEAD: 16 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 17 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 18 | 19 | INPUT: 20 | MIN_SIZE_TEST: 480 21 | 22 | -------------------------------------------------------------------------------- /configs/youtubevis_2021/ovformer_SwinB_bs8.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: ovformer_R50_bs8.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "models/swin_base_patch4_window12_384_22k.pkl" 15 | SEM_SEG_HEAD: 16 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 17 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 18 | 19 | INPUT: 20 | MIN_SIZE_TEST: 480 21 | 22 | -------------------------------------------------------------------------------- /configs/lvvis/video_ovformer_SwinB_bs8.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: video_ovformer_R50_bs8.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "models/OVFormer_swin_lvis.pth" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | SEM_SEG_HEAD: 18 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 19 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 20 | INPUT: 21 | MIN_SIZE_TEST: 480 22 | 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # output dir 2 | output* 3 | 4 | *.json 5 | *.diff 6 | *.jpg 7 | !/projects/DensePose/doc/images/*.jpg 8 | 9 | # compilation and distribution 10 | __pycache__ 11 | _ext 12 | *.pyc 13 | *.pyd 14 | *.so 15 | *.dll 16 | *.egg-info/ 17 | build/ 18 | dist/ 19 | wheels/ 20 | 21 | # pytorch/python/numpy formats 22 | *.pth 23 | *.pkl 24 | *.npy 25 | *.ts 26 | model_ts*.txt 27 | 28 | # ipython/jupyter notebooks 29 | *.ipynb 30 | **/.ipynb_checkpoints/ 31 | 32 | # Editor temporaries 33 | *.swn 34 | *.swo 35 | *.swp 36 | *~ 37 | 38 | # editor settings 39 | .idea 40 | .vscode 41 | _darcs 42 | 43 | # project dirs 44 | /detectron2/model_zoo/configs 45 | /datasets/* 46 | !/datasets/*.* 47 | /projects/*/datasets 48 | /models 49 | /snippet 50 | 51 | detectron2 -------------------------------------------------------------------------------- /ovformer/modeling/pixel_decoder/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | # Copyright (c) Facebook, Inc. and its affiliates. 
11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 12 | 13 | python setup.py build install 14 | -------------------------------------------------------------------------------- /ovformer/modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /tools/remove_lvvis_novel.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | if __name__ == '__main__': 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('--ann', default='datasets/LVVIS/train/train_instances_.json') 7 | args = parser.parse_args() 8 | 9 | print('Loading', args.ann) 10 | data = json.load(open(args.ann, 'r')) 11 | print('all #anns', len(data['annotations'])) # 15967 12 | 13 | novel_categories = [i['id'] for i in data['categories'] if i['partition'] in [2, 3]] 14 | data['annotations'] = [x for x in data['annotations'] if x['category_id'] not in novel_categories] 15 | 16 | print('nonovel #anns', len(data['annotations'])) # 10884 17 | out_path = args.ann[:-5] + 'nonovel.json' 18 | print('Saving to', out_path) 19 | json.dump(data, open(out_path, 'w')) 20 | 21 | -------------------------------------------------------------------------------- /ovformer/modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /tools/remove_lvis_rare.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
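# Purpose (summarized from the code below): drop every annotation whose category frequency is
# 'r' (rare) from an LVIS-format json and save the result as <ann minus .json>_norare.json,
# which appears to back the "lvis_v1_train_norare" split used in
# configs/lvis/Base-COCO-InstanceSegmentation.yaml.
# Assumed invocation, based on the argparse default below:
#   python tools/remove_lvis_rare.py --ann datasets/lvis/lvis_v1_train.json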
2 | import argparse 3 | import json 4 | 5 | if __name__ == '__main__': 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--ann', default='datasets/lvis/lvis_v1_train.json') 8 | args = parser.parse_args() 9 | 10 | print('Loading', args.ann) 11 | data = json.load(open(args.ann, 'r')) 12 | catid2freq = {x['id']: x['frequency'] for x in data['categories']} 13 | print('ori #anns', len(data['annotations'])) 14 | exclude = ['r'] 15 | data['annotations'] = [x for x in data['annotations'] \ 16 | if catid2freq[x['category_id']] not in exclude] 17 | print('filtered #anns', len(data['annotations'])) 18 | out_path = args.ann[:-5] + '_norare.json' 19 | print('Saving to', out_path) 20 | json.dump(data, open(out_path, 'w')) 21 | -------------------------------------------------------------------------------- /ovformer/modeling/pixel_decoder/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include "ms_deform_attn.h" 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /configs/lvis/Base-COCO-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_timm_backbone" 5 | WEIGHTS: "models/resnet50_miil_21k.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | TIMM: 9 | BASE_NAME: resnet50_in21k 10 | DATASETS: 11 | TRAIN: ("lvis_v1_train_norare",) 12 | TEST: ("lvis_v1_val",) 13 | SOLVER: 14 | IMS_PER_BATCH: 8 15 | BASE_LR: 0.0001 16 | STEPS: (398250, 420375) 17 | MAX_ITER: 442500 18 | CHECKPOINT_PERIOD: 20000 19 | WARMUP_FACTOR: 1.0 20 | WARMUP_ITERS: 10 21 | WEIGHT_DECAY: 0.05 22 | OPTIMIZER: "ADAMW" 23 | BACKBONE_MULTIPLIER: 0.1 24 | CLIP_GRADIENTS: 25 | ENABLED: True 26 | CLIP_TYPE: "full_model" 27 | CLIP_VALUE: 0.01 28 | NORM_TYPE: 2.0 29 | AMP: 30 | ENABLED: True 31 | INPUT: 32 | IMAGE_SIZE: 1024 33 | MIN_SCALE: 0.1 34 | MAX_SCALE: 2.0 35 | FORMAT: "RGB" 36 | DATASET_MAPPER_NAME: "coco_instance_lsj" 37 | TEST: 38 | EVAL_PERIOD: 20000 39 | DATALOADER: 40 | FILTER_EMPTY_ANNOTATIONS: True 41 | NUM_WORKERS: 8 42 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 43 | REPEAT_THRESHOLD: 0.001 44 | VERSION: 2 45 | -------------------------------------------------------------------------------- /tools/lvivs_test_instances_json.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import cv2 4 | 5 | val_instances = 
json.load(open('datasets/LVVIS/val/val_instances_.json', 'r')) 6 | categories = val_instances['categories'] 7 | 8 | videos = [] 9 | video_folder = 'datasets/lvvis/test/JPEGImages' 10 | video_ids = sorted(os.listdir(video_folder)) 11 | for video_id, video_name in enumerate(video_ids): 12 | video_path = os.path.join(video_folder, video_name) 13 | file_names = sorted(os.listdir(video_path)) 14 | 15 | first_frame_path = os.path.join(video_path, file_names[0]) 16 | frame = cv2.imread(first_frame_path) 17 | height, width, _ = frame.shape 18 | 19 | video_info = { 20 | 'id': video_id, 21 | 'width': width, 22 | 'height': height, 23 | 'length': len(file_names), 24 | 'file_names': [os.path.join(video_name, file_name) for file_name in file_names] 25 | } 26 | videos.append(video_info) 27 | 28 | test_data = { 29 | 'videos': videos, 30 | 'categories': categories 31 | } 32 | 33 | with open('datasets/LVVIS/test/test_instances.json', 'w') as f: 34 | json.dump(test_data, f, indent=4) 35 | 36 | print("test_instances.json done.") -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | ### Requirements 4 | - Linux or macOS with Python ≥ 3.6 5 | - PyTorch ≥ 1.9 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation. 6 | Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note: please check that the 7 | PyTorch version matches the one required by Detectron2. 8 | - Detectron2: follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html). 9 | - `pip install -r requirements.txt` 10 | 11 | ### Example conda environment setup 12 | ```bash 13 | conda create --name ovformer python=3.8 -y 14 | conda activate ovformer 15 | conda install pytorch==1.9.0 torchvision==0.10.0 cudatoolkit=11.1 -c pytorch -c nvidia 16 | pip install git+https://github.com/cocodataset/panopticapi.git 17 | pip install git+https://github.com/lvis-dataset/lvis-api.git 18 | 19 | # under your working directory 20 | git clone git@github.com:facebookresearch/detectron2.git 21 | cd detectron2 22 | pip install -e . 23 | 24 | cd .. 25 | git clone https://github.com/fanghaook/OVFormer.git 26 | cd OVFormer 27 | pip install -r requirements.txt 28 | cd ovformer/modeling/pixel_decoder/ops 29 | sh make.sh 30 | cd ../../../.. 
31 | ``` 32 | -------------------------------------------------------------------------------- /configs/lvvis/Base-LVVIS-VideoInstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | NAME: "build_timm_backbone" 4 | WEIGHTS: "models/resnet50_miil_21k.pkl" 5 | PIXEL_MEAN: [123.675, 116.280, 103.530] 6 | PIXEL_STD: [58.395, 57.120, 57.375] 7 | TIMM: 8 | BASE_NAME: resnet50_in21k 9 | DATASETS: 10 | TRAIN: ("lvvis_train",) 11 | TEST: ("lvvis_val",) 12 | SOLVER: 13 | IMS_PER_BATCH: 8 14 | BASE_LR: 0.0001 15 | STEPS: (1000,) 16 | MAX_ITER: 2000 17 | CHECKPOINT_PERIOD: 1000 18 | WARMUP_FACTOR: 1.0 19 | WARMUP_ITERS: 10 20 | WEIGHT_DECAY: 0.05 21 | OPTIMIZER: "ADAMW" 22 | BACKBONE_MULTIPLIER: 0.1 23 | CLIP_GRADIENTS: 24 | ENABLED: True 25 | CLIP_TYPE: "full_model" 26 | CLIP_VALUE: 0.01 27 | NORM_TYPE: 2.0 28 | AMP: 29 | ENABLED: True 30 | INPUT: 31 | MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" 32 | RANDOM_FLIP: "flip_by_clip" 33 | AUGMENTATIONS: [] 34 | MIN_SIZE_TRAIN: (360, 480) 35 | MIN_SIZE_TEST: 360 36 | CROP: 37 | ENABLED: False 38 | TYPE: "absolute_range" 39 | SIZE: (600, 720) 40 | FORMAT: "RGB" 41 | SAMPLING_FRAME_NUM: 2 42 | SAMPLING_FRAME_RANGE: 20 43 | TEST: 44 | EVAL_PERIOD: 0 45 | DETECTIONS_PER_IMAGE: 50 46 | DATALOADER: 47 | FILTER_EMPTY_ANNOTATIONS: False 48 | NUM_WORKERS: 4 49 | VERSION: 2 50 | -------------------------------------------------------------------------------- /ovformer/modeling/util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import json 3 | import numpy as np 4 | from torch.nn import functional as F 5 | 6 | def load_class_freq( 7 | path='datasets/metadata/lvis_v1_train_cat_info.json', freq_weight=1.0): 8 | cat_info = json.load(open(path, 'r')) 9 | if path=='datasets/metadata/lvis_v1_train_cat_info.json': 10 | cat_info = torch.tensor( 11 | [(c['image_count'] if c['frequency'] != 'r' else 0) for c in sorted(cat_info, key=lambda x: x['id'])]) 12 | else: 13 | cat_info = torch.tensor( 14 | [(c['image_count'] if c['frequency'] != 'n' else 0) for c in sorted(cat_info, key=lambda x: x['id'])]) 15 | freq_weight = cat_info.float() ** freq_weight 16 | return freq_weight 17 | 18 | 19 | def get_fed_loss_inds(gt_classes, num_sample_cats, C, weight=None): 20 | appeared = torch.unique(gt_classes) # C' 21 | prob = appeared.new_ones(C + 1).float() 22 | prob[-1] = 0 23 | if len(appeared) < num_sample_cats: 24 | if weight is not None: 25 | prob[:C] = weight.float().clone() 26 | prob[appeared] = 0 27 | more_appeared = torch.multinomial( 28 | prob, num_sample_cats - len(appeared), 29 | replacement=False) 30 | appeared = torch.cat([appeared, more_appeared]) 31 | return appeared 32 | -------------------------------------------------------------------------------- /tools/convert-thirdparty-pretrained-model-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 3 | import argparse 4 | import pickle 5 | import torch 6 | 7 | """ 8 | Usage: 9 | 10 | cd models/ 11 | wget https://miil-public-eu.oss-eu-central-1.aliyuncs.com/model-zoo/ImageNet_21K_P/models/resnet50_miil_21k.pth 12 | python ../tools/convert-thirdparty-pretrained-model-to-d2.py --path resnet50_miil_21k.pth 13 | 14 | download swin_base_patch4_window12_384_22k.pth from https://github.com/microsoft/Swin-Transformer 15 | python ../tools/convert-thirdparty-pretrained-model-to-d2.py --path swin_base_patch4_window12_384_22k.pth 16 | 17 | """ 18 | 19 | 20 | if __name__ == "__main__": 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('--path', default='') 23 | args = parser.parse_args() 24 | 25 | print('Loading', args.path) 26 | model = torch.load(args.path, map_location="cpu") 27 | # import pdb; pdb.set_trace() 28 | if 'model' in model: 29 | model = model['model'] 30 | if 'state_dict' in model: 31 | model = model['state_dict'] 32 | ret = { 33 | "model": model, 34 | "__author__": "third_party", 35 | "matching_heuristics": True 36 | } 37 | out_path = args.path.replace('.pth', '.pkl') 38 | print('Saving to', out_path) 39 | pickle.dump(ret, open(out_path, "wb")) -------------------------------------------------------------------------------- /ovformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import data # register all new datasets 3 | from . import modeling 4 | 5 | # config 6 | from .config import add_ovformer_config 7 | 8 | # dataset loading 9 | from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper 10 | from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper 11 | from .data.dataset_mappers.mask_former_instance_dataset_mapper import ( 12 | MaskFormerInstanceDatasetMapper, 13 | ) 14 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import ( 15 | MaskFormerPanopticDatasetMapper, 16 | ) 17 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 18 | MaskFormerSemanticDatasetMapper, 19 | ) 20 | 21 | # models 22 | from .ovformer_model_video import OVFormerVideo 23 | from .ovformer_model import OVFormer 24 | from .test_time_augmentation import SemanticSegmentorWithTTA 25 | 26 | # evaluation 27 | from .evaluation.instance_evaluation import InstanceSegEvaluator 28 | 29 | 30 | from .data_video import ( 31 | YTVISDatasetMapper, 32 | YTVISEvaluator, 33 | OVISEvaluator, 34 | LVVISEvaluator, 35 | LVVISEvaluator_video, 36 | BURSTEvaluator, 37 | build_detection_train_loader, 38 | build_detection_test_loader, 39 | get_detection_dataset_dicts, 40 | ) 41 | -------------------------------------------------------------------------------- /ovformer/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor ms_deform_attn_cuda_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step); 26 | 27 | std::vector ms_deform_attn_cuda_backward( 28 | const at::Tensor &value, 29 | const at::Tensor &spatial_shapes, 30 | const at::Tensor &level_start_index, 31 | const at::Tensor &sampling_loc, 32 | const at::Tensor &attn_weight, 33 | const at::Tensor &grad_output, 34 | const int im2col_step); 35 | 36 | -------------------------------------------------------------------------------- /ovformer/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor 20 | ms_deform_attn_cpu_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step); 27 | 28 | std::vector 29 | ms_deform_attn_cpu_backward( 30 | const at::Tensor &value, 31 | const at::Tensor &spatial_shapes, 32 | const at::Tensor &level_start_index, 33 | const at::Tensor &sampling_loc, 34 | const at::Tensor &attn_weight, 35 | const at::Tensor &grad_output, 36 | const int im2col_step); 37 | 38 | 39 | -------------------------------------------------------------------------------- /tools/ytvis_json.py: -------------------------------------------------------------------------------- 1 | # Split results.json into base categories and novel categories, and zip compress them 2 | # YTVIS19:AP*40 = APb*33 + APn*7 3 | # YTVIS21:AP*40 = APb*34 + APn*6 4 | import json 5 | import zipfile 6 | import os 7 | 8 | # novel_id_ytvis19 = [6, 7, 9, 11, 23, 24, 39] 9 | # novel_id_ytvis21 = [11, 14, 15, 20, 30, 39] 10 | novel_id = [6, 7, 9, 11, 23, 24, 39] 11 | novel_list = [] 12 | base_list = [] 13 | 14 | results = json.load(open('output/inference/ytvis_2019_val/results.json', 'r')) 15 | for result in results: 16 | if result['category_id'] in novel_id: 17 | novel_list.append(result) 18 | else: 19 | base_list.append(result) 20 | 21 | # all.zip 22 | file_name = "results.json" 23 | with open(file_name, 'w') as json_file: 24 | json.dump(results, json_file) 25 | zip_file_name = "all.zip" 26 | with zipfile.ZipFile(zip_file_name, "w", zipfile.ZIP_DEFLATED) as zipf: 27 | zipf.write(file_name) 28 | 29 | # novel.zip 30 | with open(file_name, 'w') as json_file: 31 | json.dump(novel_list, json_file) 32 | zip_file_name = "novel.zip" 33 | with zipfile.ZipFile(zip_file_name, "w", zipfile.ZIP_DEFLATED) as zipf: 34 | zipf.write(file_name) 35 | 36 | # base.zip 37 | with open(file_name, 'w') as json_file: 38 | json.dump(base_list, json_file) 39 | zip_file_name = "base.zip" 40 | with zipfile.ZipFile(zip_file_name, "w", zipfile.ZIP_DEFLATED) as zipf: 41 | zipf.write(file_name) 42 | 43 | # delete results.json 44 | if os.path.exists(file_name): 45 | os.remove(file_name) -------------------------------------------------------------------------------- /configs/lvis/ovformer_R50_bs8.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "OVFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 1203 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["layer2", "layer3", "layer4", "layer5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["layer3", "layer4", "layer5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | CLIP_TEXT_PATH: "datasets/metadata/fg_bg_5_10_lvis_ens.npy" 22 | CLIP_CLASSIFIER: True 23 | DEEP_SUPERVISION: True 24 | NO_OBJECT_WEIGHT: 0.1 25 | OBJECT_WEIGHT: 2.0 26 | CLASS_WEIGHT: 2.0 27 | MASK_WEIGHT: 5.0 28 | DICE_WEIGHT: 5.0 29 | HIDDEN_DIM: 256 30 | NUM_OBJECT_QUERIES: 100 31 | NHEADS: 8 32 | 
DROPOUT: 0.0 33 | DIM_FEEDFORWARD: 2048 34 | ENC_LAYERS: 0 35 | PRE_NORM: False 36 | ENFORCE_INPUT_PROJ: False 37 | SIZE_DIVISIBILITY: 32 38 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 39 | TRAIN_NUM_POINTS: 12544 40 | OVERSAMPLE_RATIO: 3.0 41 | IMPORTANCE_SAMPLE_RATIO: 0.75 42 | TEST: 43 | SEMANTIC_ON: False 44 | INSTANCE_ON: True 45 | PANOPTIC_ON: False 46 | OVERLAP_THRESHOLD: 0.8 47 | OBJECT_MASK_THRESHOLD: 0.8 48 | -------------------------------------------------------------------------------- /ovformer/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | 22 | at::Tensor 23 | ms_deform_attn_cpu_forward( 24 | const at::Tensor &value, 25 | const at::Tensor &spatial_shapes, 26 | const at::Tensor &level_start_index, 27 | const at::Tensor &sampling_loc, 28 | const at::Tensor &attn_weight, 29 | const int im2col_step) 30 | { 31 | AT_ERROR("Not implement on cpu"); 32 | } 33 | 34 | std::vector 35 | ms_deform_attn_cpu_backward( 36 | const at::Tensor &value, 37 | const at::Tensor &spatial_shapes, 38 | const at::Tensor &level_start_index, 39 | const at::Tensor &sampling_loc, 40 | const at::Tensor &attn_weight, 41 | const at::Tensor &grad_output, 42 | const int im2col_step) 43 | { 44 | AT_ERROR("Not implement on cpu"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /tools/get_lvvis_cat_info.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | if __name__ == '__main__': 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--ann", default='datasets/LVVIS/train/train_instances_nonovel.json') 7 | parser.add_argument("--add_freq", action='store_true') 8 | 9 | args = parser.parse_args() 10 | 11 | print('Loading', args.ann) 12 | data = json.load(open(args.ann, 'r')) 13 | cats = data['categories'] 14 | videos = data['videos'] 15 | 16 | video_count = {x['id']: set() for x in cats} 17 | image_count = {x['id']: 0 for x in cats} 18 | ann_count = {x['id']: 0 for x in cats} 19 | 20 | for x in data['annotations']: 21 | video_count[x['category_id']].add(x['video_id']) 22 | ann_count[x['category_id']] += x['length'] 23 | 24 | for category_id, video_set in video_count.items(): 25 | for video_id in video_set: 26 | image_count[category_id] += videos[video_id]['length'] 27 | 28 | num_freqs = {x: 0 for x in ['b', 'n']} 29 | for x in cats: 30 | x['image_count'] = image_count[x['id']] 31 | x['instance_count'] = ann_count[x['id']] 32 | if args.add_freq: 33 | freq = 'b' 34 | if x['image_count'] == 0: 35 | freq = 'n' 36 | x['frequency'] = freq 37 | num_freqs[freq] += 1 38 
| 39 | if args.add_freq: 40 | for x in ['b', 'n']: 41 | print(x, num_freqs[x]) 42 | out = cats # {'categories': cats} 43 | out_path = 'datasets/metadata/lvvis_train_cat_info.json' 44 | print('Saving to', out_path) 45 | json.dump(out, open(out_path, 'w')) 46 | -------------------------------------------------------------------------------- /tools/get_lvis_cat_info.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import argparse 3 | import json 4 | 5 | if __name__ == '__main__': 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--ann", default='datasets/lvis/lvis_v1_train.json') 8 | parser.add_argument("--add_freq", action='store_true') 9 | parser.add_argument("--r_thresh", type=int, default=10) 10 | parser.add_argument("--c_thresh", type=int, default=100) 11 | args = parser.parse_args() 12 | 13 | print('Loading', args.ann) 14 | data = json.load(open(args.ann, 'r')) 15 | cats = data['categories'] 16 | image_count = {x['id']: set() for x in cats} 17 | ann_count = {x['id']: 0 for x in cats} 18 | for x in data['annotations']: 19 | image_count[x['category_id']].add(x['image_id']) 20 | ann_count[x['category_id']] += 1 21 | num_freqs = {x: 0 for x in ['r', 'f', 'c']} 22 | for x in cats: 23 | x['image_count'] = len(image_count[x['id']]) 24 | x['instance_count'] = ann_count[x['id']] 25 | if args.add_freq: 26 | freq = 'f' 27 | if x['image_count'] < args.c_thresh: 28 | freq = 'c' 29 | if x['image_count'] < args.r_thresh: 30 | freq = 'r' 31 | x['frequency'] = freq 32 | num_freqs[freq] += 1 33 | print(cats) 34 | image_counts = sorted([x['image_count'] for x in cats]) 35 | # print('image count', image_counts) 36 | # import pdb; pdb.set_trace() 37 | if args.add_freq: 38 | for x in ['r', 'c', 'f']: 39 | print(x, num_freqs[x]) 40 | out = cats # {'categories': cats} 41 | out_path = 'datasets/metadata/lvis_v1_train_cat_info.json' 42 | print('Saving to', out_path) 43 | json.dump(out, open(out_path, 'w')) 44 | 45 | -------------------------------------------------------------------------------- /configs/ovis/ovformer_R50_bs8.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../lvis/Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "OVFormerVideo" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 25 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["layer2", "layer3", "layer4", "layer5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["layer3", "layer4", "layer5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | CLIP_TEXT_PATH: "datasets/metadata/fg_bg_5_10_ovis_ens.npy" 22 | CLIP_IMAGE_PATH: "datasets/metadata/ovis_val_clip_feature.pkl" 23 | CLIP_CLASSIFIER: True 24 | DEEP_SUPERVISION: True 25 | NO_OBJECT_WEIGHT: 0.1 26 | OBJECT_WEIGHT: 2.0 27 | CLASS_WEIGHT: 2.0 28 | MASK_WEIGHT: 5.0 29 | DICE_WEIGHT: 5.0 30 | HIDDEN_DIM: 256 31 | NUM_OBJECT_QUERIES: 100 32 | NHEADS: 8 33 | DROPOUT: 0.0 34 | DIM_FEEDFORWARD: 2048 35 | ENC_LAYERS: 0 36 | PRE_NORM: False 37 | ENFORCE_INPUT_PROJ: False 38 | SIZE_DIVISIBILITY: 32 39 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 40 | TRAIN_NUM_POINTS: 12544 41 | 
OVERSAMPLE_RATIO: 3.0 42 | IMPORTANCE_SAMPLE_RATIO: 0.75 43 | TEST: 44 | SEMANTIC_ON: False 45 | INSTANCE_ON: True 46 | PANOPTIC_ON: False 47 | OVERLAP_THRESHOLD: 0.8 48 | OBJECT_MASK_THRESHOLD: 0.8 49 | 50 | DATASETS: 51 | TEST: ("ovis_val",) 52 | 53 | INPUT: 54 | MIN_SIZE_TEST: 360 55 | 56 | TEST: 57 | DETECTIONS_PER_IMAGE: 20 58 | -------------------------------------------------------------------------------- /configs/burst/ovformer_R50_bs8.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../lvis/Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "OVFormerVideo" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 482 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["layer2", "layer3", "layer4", "layer5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["layer3", "layer4", "layer5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | CLIP_TEXT_PATH: "datasets/metadata/fg_bg_5_10_burst_ens.npy" 22 | CLIP_IMAGE_PATH: "datasets/metadata/burst_val_clip_feature.pkl" 23 | CLIP_CLASSIFIER: True 24 | DEEP_SUPERVISION: True 25 | NO_OBJECT_WEIGHT: 0.1 26 | OBJECT_WEIGHT: 2.0 27 | CLASS_WEIGHT: 2.0 28 | MASK_WEIGHT: 5.0 29 | DICE_WEIGHT: 5.0 30 | HIDDEN_DIM: 256 31 | NUM_OBJECT_QUERIES: 100 32 | NHEADS: 8 33 | DROPOUT: 0.0 34 | DIM_FEEDFORWARD: 2048 35 | ENC_LAYERS: 0 36 | PRE_NORM: False 37 | ENFORCE_INPUT_PROJ: False 38 | SIZE_DIVISIBILITY: 32 39 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 40 | TRAIN_NUM_POINTS: 12544 41 | OVERSAMPLE_RATIO: 3.0 42 | IMPORTANCE_SAMPLE_RATIO: 0.75 43 | TEST: 44 | SEMANTIC_ON: False 45 | INSTANCE_ON: True 46 | PANOPTIC_ON: False 47 | OVERLAP_THRESHOLD: 0.8 48 | OBJECT_MASK_THRESHOLD: 0.8 49 | 50 | DATASETS: 51 | TEST: ("burst_val",) 52 | 53 | INPUT: 54 | MIN_SIZE_TEST: 360 55 | 56 | TEST: 57 | DETECTIONS_PER_IMAGE: 50 58 | -------------------------------------------------------------------------------- /configs/lvvis/ovformer_R50_bs8.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../lvis/Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "OVFormerVideo" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 1196 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["layer2", "layer3", "layer4", "layer5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["layer3", "layer4", "layer5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | CLIP_TEXT_PATH: "datasets/metadata/fg_bg_5_10_lvvis_ens.npy" 22 | CLIP_IMAGE_PATH: "datasets/metadata/lvvis_val_clip_feature.pkl" 23 | CLIP_CLASSIFIER: True 24 | DEEP_SUPERVISION: True 25 | NO_OBJECT_WEIGHT: 0.1 26 | OBJECT_WEIGHT: 2.0 27 | CLASS_WEIGHT: 2.0 28 | MASK_WEIGHT: 5.0 29 | DICE_WEIGHT: 5.0 30 | HIDDEN_DIM: 256 31 | NUM_OBJECT_QUERIES: 100 32 | NHEADS: 8 33 | DROPOUT: 0.0 34 | DIM_FEEDFORWARD: 2048 35 | ENC_LAYERS: 0 36 | PRE_NORM: False 37 | ENFORCE_INPUT_PROJ: False 38 | 
SIZE_DIVISIBILITY: 32 39 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 40 | TRAIN_NUM_POINTS: 12544 41 | OVERSAMPLE_RATIO: 3.0 42 | IMPORTANCE_SAMPLE_RATIO: 0.75 43 | TEST: 44 | SEMANTIC_ON: False 45 | INSTANCE_ON: True 46 | PANOPTIC_ON: False 47 | OVERLAP_THRESHOLD: 0.8 48 | OBJECT_MASK_THRESHOLD: 0.8 49 | 50 | DATASETS: 51 | TEST: ("lvvis_val",) 52 | 53 | INPUT: 54 | MIN_SIZE_TEST: 360 55 | 56 | TEST: 57 | DETECTIONS_PER_IMAGE: 50 58 | -------------------------------------------------------------------------------- /configs/youtubevis_2019/ovformer_R50_bs8.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../lvis/Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "OVFormerVideo" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 40 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["layer2", "layer3", "layer4", "layer5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["layer3", "layer4", "layer5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | CLIP_TEXT_PATH: "datasets/metadata/fg_bg_5_10_ytvis19_ens.npy" 22 | CLIP_IMAGE_PATH: "datasets/metadata/ytvis_2019_val_clip_feature.pkl" 23 | CLIP_CLASSIFIER: True 24 | DEEP_SUPERVISION: True 25 | NO_OBJECT_WEIGHT: 0.1 26 | OBJECT_WEIGHT: 2.0 27 | CLASS_WEIGHT: 2.0 28 | MASK_WEIGHT: 5.0 29 | DICE_WEIGHT: 5.0 30 | HIDDEN_DIM: 256 31 | NUM_OBJECT_QUERIES: 100 32 | NHEADS: 8 33 | DROPOUT: 0.0 34 | DIM_FEEDFORWARD: 2048 35 | ENC_LAYERS: 0 36 | PRE_NORM: False 37 | ENFORCE_INPUT_PROJ: False 38 | SIZE_DIVISIBILITY: 32 39 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 40 | TRAIN_NUM_POINTS: 12544 41 | OVERSAMPLE_RATIO: 3.0 42 | IMPORTANCE_SAMPLE_RATIO: 0.75 43 | TEST: 44 | SEMANTIC_ON: False 45 | INSTANCE_ON: True 46 | PANOPTIC_ON: False 47 | OVERLAP_THRESHOLD: 0.8 48 | OBJECT_MASK_THRESHOLD: 0.8 49 | 50 | DATASETS: 51 | TEST: ("ytvis_2019_val",) 52 | 53 | INPUT: 54 | MIN_SIZE_TEST: 360 55 | 56 | TEST: 57 | DETECTIONS_PER_IMAGE: 10 58 | -------------------------------------------------------------------------------- /configs/youtubevis_2021/ovformer_R50_bs8.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../lvis/Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "OVFormerVideo" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 40 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["layer2", "layer3", "layer4", "layer5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["layer3", "layer4", "layer5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | CLIP_TEXT_PATH: "datasets/metadata/fg_bg_5_10_ytvis21_ens.npy" 22 | CLIP_IMAGE_PATH: "datasets/metadata/ytvis_2021_val_clip_feature.pkl" 23 | CLIP_CLASSIFIER: True 24 | DEEP_SUPERVISION: True 25 | NO_OBJECT_WEIGHT: 0.1 26 | OBJECT_WEIGHT: 2.0 27 | CLASS_WEIGHT: 2.0 28 | MASK_WEIGHT: 5.0 29 | DICE_WEIGHT: 5.0 30 | 
HIDDEN_DIM: 256 31 | NUM_OBJECT_QUERIES: 100 32 | NHEADS: 8 33 | DROPOUT: 0.0 34 | DIM_FEEDFORWARD: 2048 35 | ENC_LAYERS: 0 36 | PRE_NORM: False 37 | ENFORCE_INPUT_PROJ: False 38 | SIZE_DIVISIBILITY: 32 39 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 40 | TRAIN_NUM_POINTS: 12544 41 | OVERSAMPLE_RATIO: 3.0 42 | IMPORTANCE_SAMPLE_RATIO: 0.75 43 | TEST: 44 | SEMANTIC_ON: False 45 | INSTANCE_ON: True 46 | PANOPTIC_ON: False 47 | OVERLAP_THRESHOLD: 0.8 48 | OBJECT_MASK_THRESHOLD: 0.8 49 | 50 | DATASETS: 51 | TEST: ("ytvis_2021_val",) 52 | 53 | INPUT: 54 | MIN_SIZE_TEST: 360 55 | 56 | TEST: 57 | DETECTIONS_PER_IMAGE: 10 58 | -------------------------------------------------------------------------------- /configs/lvvis/video_ovformer_R50_bs8.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-LVVIS-VideoInstanceSegmentation.yaml 2 | MODEL: 3 | WEIGHTS: "models/ovformer_r50_lvis.pth" 4 | META_ARCHITECTURE: "VideoOVFormer" 5 | SEM_SEG_HEAD: 6 | NAME: "MaskFormerHead" 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 1196 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["layer2", "layer3", "layer4", "layer5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["layer3", "layer4", "layer5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | CLIP_TEXT_PATH: "datasets/metadata/fg_bg_5_10_lvvis_ens.npy" 23 | CLIP_IMAGE_PATH: "datasets/metadata/lvvis_val_clip_feature.pkl" 24 | #CLIP_IMAGE_PATH: "datasets/metadata/lvvis_test_clip_feature.pkl" 25 | CLIP_CLASSIFIER: True 26 | DEEP_SUPERVISION: True 27 | NO_OBJECT_WEIGHT: 0.1 28 | OBJECT_WEIGHT: 2.0 29 | CLASS_WEIGHT: 2.0 30 | MASK_WEIGHT: 5.0 31 | DICE_WEIGHT: 5.0 32 | HIDDEN_DIM: 256 33 | NUM_OBJECT_QUERIES: 100 34 | NHEADS: 8 35 | DROPOUT: 0.0 36 | DIM_FEEDFORWARD: 2048 37 | ENC_LAYERS: 0 38 | PRE_NORM: False 39 | ENFORCE_INPUT_PROJ: False 40 | SIZE_DIVISIBILITY: 32 41 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 42 | TRAIN_NUM_POINTS: 12544 43 | OVERSAMPLE_RATIO: 3.0 44 | IMPORTANCE_SAMPLE_RATIO: 0.75 45 | TEST: 46 | SEMANTIC_ON: False 47 | INSTANCE_ON: True 48 | PANOPTIC_ON: False 49 | OVERLAP_THRESHOLD: 0.8 50 | OBJECT_MASK_THRESHOLD: 0.8 51 | 52 | DATASETS: 53 | TEST: ("lvvis_val",) 54 | # TEST: ("lvvis_test",) -------------------------------------------------------------------------------- /ovformer/modeling/pixel_decoder/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | 18 | #include "cpu/ms_deform_attn_cpu.h" 19 | 20 | #ifdef WITH_CUDA 21 | #include "cuda/ms_deform_attn_cuda.h" 22 | #endif 23 | 24 | 25 | at::Tensor 26 | ms_deform_attn_forward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const int im2col_step) 33 | { 34 | if (value.type().is_cuda()) 35 | { 36 | #ifdef WITH_CUDA 37 | return ms_deform_attn_cuda_forward( 38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector 47 | ms_deform_attn_backward( 48 | const at::Tensor &value, 49 | const at::Tensor &spatial_shapes, 50 | const at::Tensor &level_start_index, 51 | const at::Tensor &sampling_loc, 52 | const at::Tensor &attn_weight, 53 | const at::Tensor &grad_output, 54 | const int im2col_step) 55 | { 56 | if (value.type().is_cuda()) 57 | { 58 | #ifdef WITH_CUDA 59 | return ms_deform_attn_cuda_backward( 60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 61 | #else 62 | AT_ERROR("Not compiled with GPU support"); 63 | #endif 64 | } 65 | AT_ERROR("Not implemented on the CPU"); 66 | } 67 | 68 | -------------------------------------------------------------------------------- /mAP.py: -------------------------------------------------------------------------------- 1 | from evaluate.lvvis import LVVIS 2 | from evaluate.burst import BURST 3 | from evaluate.bursteval import BURSTeval 4 | from evaluate.lvviseval import LVVISeval 5 | 6 | import sys 7 | import numpy as np 8 | import os 9 | from pycocotools.coco import COCO 10 | from pycocotools.cocoeval import COCOeval 11 | import json 12 | import itertools 13 | import torch 14 | from detectron2.utils.file_io import PathManager 15 | import argparse 16 | import json 17 | from datetime import datetime 18 | 19 | import os 20 | import sys 21 | import logging 22 | 23 | 24 | def pth_to_json(pth_path): 25 | predictions=torch.load(pth_path) 26 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) 27 | for result in coco_results: 28 | category_id = result["category_id"] 29 | result["category_id"] = category_id+1 30 | file_path = os.path.join(os.path.dirname(pth_path), "instances_results.json") 31 | with PathManager.open(file_path, "w") as f: 32 | f.write(json.dumps(coco_results)) 33 | f.flush() 34 | 35 | if __name__ == '__main__': 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument('--dt', default='output') 38 | parser.add_argument('--et', default='("lvvis_val",)') # ("burst_val",) 39 | 40 | args = parser.parse_args() 41 | dt_path=os.path.join(args.dt,'inference/results.json') 42 | output_file = os.path.join(os.path.dirname(args.dt), "results.txt") 43 | logging.basicConfig(filename=output_file, level=logging.INFO, filemode='a', format='%(asctime)s - %(levelname)s - %(message)s') 44 | 45 | 46 | console = logging.StreamHandler(sys.stdout) 47 | console.setLevel(logging.INFO) 48 | formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') 49 | console.setFormatter(formatter) 50 | logging.getLogger('').addHandler(console) 51 | logger = logging.getLogger(__name__) 52 | eval_type=args.et 53 | 54 | if 'lvvis' in eval_type: 55 | 
DATAEVAL=LVVIS 56 | DATAEVALeval=LVVISeval 57 | gt_path='datasets/LVVIS/val/val_instances_.json' 58 | 59 | elif 'burst' in eval_type: 60 | DATAEVAL=BURST 61 | DATAEVALeval=BURSTeval 62 | gt_path='datasets/burst/b2y_val.json' 63 | else: 64 | logger.info("\n") 65 | logger.info(f"\nAnnotations is invalid\n") 66 | raise NotImplementedError 67 | 68 | current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 69 | logger.info("\n") 70 | logger.info(f"\n===== {current_time} =====\n") 71 | ytvosGT = DATAEVAL(gt_path) 72 | ytvosDT = ytvosGT.loadRes(dt_path) 73 | ytvosEval = DATAEVALeval(ytvosGT, ytvosDT, "segm") 74 | ytvosEval.evaluate() 75 | ytvosEval.accumulate() 76 | ytvosEval.summarize() -------------------------------------------------------------------------------- /ovformer/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | if mask is None: 31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 32 | not_mask = ~mask 33 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 34 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 35 | if self.normalize: 36 | eps = 1e-6 37 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 38 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 39 | 40 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 41 | dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode='floor') / self.num_pos_feats) 42 | 43 | pos_x = x_embed[:, :, :, None] / dim_t 44 | pos_y = y_embed[:, :, :, None] / dim_t 45 | pos_x = torch.stack( 46 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 47 | ).flatten(3) 48 | pos_y = torch.stack( 49 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 50 | ).flatten(3) 51 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 52 | return pos 53 | 54 | def __repr__(self, _repr_indent=4): 55 | head = "Positional encoding " + self.__class__.__name__ 56 | body = [ 57 | "num_pos_feats: {}".format(self.num_pos_feats), 58 | "temperature: {}".format(self.temperature), 59 | "normalize: {}".format(self.normalize), 60 | "scale: {}".format(self.scale), 61 | ] 62 | # _repr_indent = 4 63 | lines = [head] + [" " * _repr_indent + line for line in body] 64 | return "\n".join(lines) 65 | -------------------------------------------------------------------------------- /ovformer_video/modeling/transformer_decoder/position_encoding.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine3D(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | # b, t, c, h, w 31 | assert x.dim() == 5, f"{x.shape} should be a 5-dimensional Tensor, got {x.dim()}-dimensional Tensor instead" 32 | if mask is None: 33 | mask = torch.zeros((x.size(0), x.size(1), x.size(3), x.size(4)), device=x.device, dtype=torch.bool) 34 | not_mask = ~mask 35 | z_embed = not_mask.cumsum(1, dtype=torch.float32) 36 | y_embed = not_mask.cumsum(2, dtype=torch.float32) 37 | x_embed = not_mask.cumsum(3, dtype=torch.float32) 38 | if self.normalize: 39 | eps = 1e-6 40 | z_embed = z_embed / (z_embed[:, -1:, :, :] + eps) * self.scale 41 | y_embed = y_embed / (y_embed[:, :, -1:, :] + eps) * self.scale 42 | x_embed = x_embed / (x_embed[:, :, :, -1:] + eps) * self.scale 43 | 44 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 45 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 46 | 47 | dim_t_z = torch.arange((self.num_pos_feats * 2), dtype=torch.float32, device=x.device) 48 | dim_t_z = self.temperature ** (2 * (dim_t_z // 2) / (self.num_pos_feats * 2)) 49 | 50 | pos_x = x_embed[:, :, :, :, None] / dim_t 51 | pos_y = y_embed[:, :, :, :, None] / dim_t 52 | pos_z = z_embed[:, :, :, :, None] / dim_t_z 53 | pos_x = torch.stack((pos_x[:, :, :, :, 0::2].sin(), pos_x[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 54 | pos_y = torch.stack((pos_y[:, :, :, :, 0::2].sin(), pos_y[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 55 | pos_z = torch.stack((pos_z[:, :, :, :, 0::2].sin(), pos_z[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 56 | pos = (torch.cat((pos_y, pos_x), dim=4) + pos_z).permute(0, 1, 4, 2, 3) # b, t, c, h, w 57 | return pos 58 | -------------------------------------------------------------------------------- /ovformer_video/utils/memory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | import logging 4 | from contextlib import contextmanager 5 | from functools import wraps 6 | import torch 7 | from torch.cuda.amp import autocast 8 | 9 | __all__ = ["retry_if_cuda_oom"] 10 | 11 | 12 | @contextmanager 13 | def _ignore_torch_cuda_oom(): 14 | """ 15 | A context which ignores CUDA OOM exception from pytorch. 16 | """ 17 | try: 18 | yield 19 | except RuntimeError as e: 20 | # NOTE: the string may change? 21 | if "CUDA out of memory. 
" in str(e): 22 | pass 23 | else: 24 | raise 25 | 26 | 27 | def retry_if_cuda_oom(func): 28 | """ 29 | Makes a function retry itself after encountering 30 | pytorch's CUDA OOM error. 31 | It will first retry after calling `torch.cuda.empty_cache()`. 32 | If that still fails, it will then retry by trying to convert inputs to CPUs. 33 | In this case, it expects the function to dispatch to CPU implementation. 34 | The return values may become CPU tensors as well and it's user's 35 | responsibility to convert it back to CUDA tensor if needed. 36 | Args: 37 | func: a stateless callable that takes tensor-like objects as arguments 38 | Returns: 39 | a callable which retries `func` if OOM is encountered. 40 | Examples: 41 | :: 42 | output = retry_if_cuda_oom(some_torch_function)(input1, input2) 43 | # output may be on CPU even if inputs are on GPU 44 | Note: 45 | 1. When converting inputs to CPU, it will only look at each argument and check 46 | if it has `.device` and `.to` for conversion. Nested structures of tensors 47 | are not supported. 48 | 2. Since the function might be called more than once, it has to be 49 | stateless. 50 | """ 51 | 52 | def maybe_to_cpu(x): 53 | try: 54 | like_gpu_tensor = x.device.type == "cuda" and hasattr(x, "to") 55 | except AttributeError: 56 | like_gpu_tensor = False 57 | if like_gpu_tensor: 58 | return x.to(device="cpu").to(torch.float32) 59 | else: 60 | return x 61 | 62 | @wraps(func) 63 | def wrapped(*args, **kwargs): 64 | with _ignore_torch_cuda_oom(): 65 | return func(*args, **kwargs) 66 | 67 | # Clear cache and retry 68 | torch.cuda.empty_cache() 69 | with _ignore_torch_cuda_oom(): 70 | return func(*args, **kwargs) 71 | 72 | # Try on CPU. This slows down the code significantly, therefore print a notice. 73 | logger = logging.getLogger(__name__) 74 | logger.info("Attempting to copy inputs to CPU due to CUDA OOM") 75 | new_args = (maybe_to_cpu(x) for x in args) 76 | new_kwargs = {k: maybe_to_cpu(v) for k, v in kwargs.items()} 77 | with autocast(enabled=False): 78 | return func(*new_args, **new_kwargs) 79 | 80 | return wrapped 81 | -------------------------------------------------------------------------------- /ovformer/modeling/pixel_decoder/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | import os 13 | import glob 14 | 15 | import torch 16 | 17 | from torch.utils.cpp_extension import CUDA_HOME 18 | from torch.utils.cpp_extension import CppExtension 19 | from torch.utils.cpp_extension import CUDAExtension 20 | 21 | from setuptools import find_packages 22 | from setuptools import setup 23 | 24 | requirements = ["torch", "torchvision"] 25 | 26 | def get_extensions(): 27 | this_dir = os.path.dirname(os.path.abspath(__file__)) 28 | extensions_dir = os.path.join(this_dir, "src") 29 | 30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 33 | 34 | sources = main_file + source_cpu 35 | extension = CppExtension 36 | extra_compile_args = {"cxx": []} 37 | define_macros = [] 38 | 39 | # Force cuda since torch ask for a device, not if cuda is in fact available. 40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: 41 | extension = CUDAExtension 42 | sources += source_cuda 43 | define_macros += [("WITH_CUDA", None)] 44 | extra_compile_args["nvcc"] = [ 45 | "-DCUDA_HAS_FP16=1", 46 | "-D__CUDA_NO_HALF_OPERATORS__", 47 | "-D__CUDA_NO_HALF_CONVERSIONS__", 48 | "-D__CUDA_NO_HALF2_OPERATORS__", 49 | ] 50 | else: 51 | if CUDA_HOME is None: 52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') 53 | else: 54 | raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') 55 | 56 | sources = [os.path.join(extensions_dir, s) for s in sources] 57 | include_dirs = [extensions_dir] 58 | ext_modules = [ 59 | extension( 60 | "MultiScaleDeformableAttention", 61 | sources, 62 | include_dirs=include_dirs, 63 | define_macros=define_macros, 64 | extra_compile_args=extra_compile_args, 65 | ) 66 | ] 67 | return ext_modules 68 | 69 | setup( 70 | name="MultiScaleDeformableAttention", 71 | version="1.0", 72 | author="Weijie Su", 73 | url="https://github.com/fundamentalvision/Deformable-DETR", 74 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 75 | packages=find_packages(exclude=("configs", "tests",)), 76 | ext_modules=get_extensions(), 77 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 78 | ) 79 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Unified Embedding Alignment for Open-Vocabulary Video Instance Segmentation (ECCV 2024) 2 | 3 | [Hao Fang](https://fanghaook.github.io/), 4 | Peng Wu, 5 | [Yawei Li](https://scholar.google.com.hk/citations?user=IFLsTGsAAAAJ), 6 | [Xinxin Zhang](https://scholar.google.cz/citations?user=rPv44PoAAAAJ), 7 | [Xiankai Lu](https://scholar.google.com.hk/citations?user=QS5V5b8AAAAJ) 8 | 9 | [[`paper`](https://arxiv.org/pdf/2407.07427)] [[`BibTeX`](#CitingOVFormer)] 10 | 11 |
12 | ![OVFormer overview](OVFormer.png)
13 | 
14 | 15 | ## Installation 16 | 17 | See [installation instructions](INSTALL.md). 18 | 19 | ## Data Preparation 20 | See [Preparing Datasets for OVFormer](./datasets/README.md). 21 | 22 | ## Getting Started 23 | We first train the OVFormer model on the LVIS dataset: 24 | ```bash 25 | python train_net.py --num-gpus 4 \ 26 | --config-file configs/lvis/ovformer_R50_bs8.yaml 27 | ``` 28 | To evaluate the model's zero-shot generalization performance on the VIS datasets, use 29 | ```bash 30 | python train_net_video.py \ 31 | --config-file configs/youtubevis_2019/ovformer_R50_bs8.yaml \ 32 | --eval-only MODEL.WEIGHTS models/ovformer_r50_lvis.pth 33 | ``` 34 | For YTVIS19/21, split results.json into base and novel categories with this [tool](./tools/ytvis_json.py); 35 | for OVIS, package the results and upload them to the evaluation server; for BURST, run ```mAP.py```. 36 | You are expected to get results like this: 37 | 38 | | Model | Backbone | YTVIS19 | YTVIS21 | OVIS | BURST | weights | 39 | |:--------:|:--------:|:-------:|:-------:|:----:|:-----:|:---------:| 40 | | OVFormer | R-50 | 34.8 | 29.8 | 15.1 | 6.8 | [model](https://drive.google.com/file/d/1-tMcjp8xIYr9E5r5JYOESGajXtMAs33y/view?usp=sharing) | 41 | | OVFormer | Swin-B | 44.3 | 37.6 | 21.3 | 7.6 | [model](https://drive.google.com/file/d/102qxZlu05yXILfghhrwjxv-tL3MlcYu7/view?usp=sharing) | 42 | 43 | Then, we perform video-based training of OVFormer on the LV-VIS dataset: 44 | ```bash 45 | python train_net_lvvis.py --num-gpus 4 \ 46 | --config-file configs/lvvis/video_ovformer_R50_bs8.yaml 47 | ``` 48 | To evaluate the model's performance on the LV-VIS dataset, use 49 | ```bash 50 | python train_net_lvvis.py \ 51 | --config-file configs/lvvis/video_ovformer_R50_bs8.yaml \ 52 | --eval-only MODEL.WEIGHTS models/ovformer_r50_lvvis.pth 53 | ``` 54 | Run ```mAP.py``` (see the sketch at the end of this README); you are expected to get results like this: 55 | 56 | | Model | Backbone | LVVIS val | LVVIS test | weights | 57 | |:--------------:|:--------:|:---------:|:----------:|:---------:| 58 | | OVFormer | R-50 | 21.9 | 15.2 | [model](https://drive.google.com/file/d/1-zfEwdglPeVHzlc5Ky_HJlZtMgGXAy1S/view?usp=sharing) | 59 | | OVFormer | Swin-B | 24.7 | 19.5 | [model](https://drive.google.com/file/d/107BNsu9eTr5e70B4oj28jgHKBjYTWNWp/view?usp=sharing) | 60 | 61 | ## Citing OVFormer 62 | ```BibTeX 63 | @inproceedings{fang2024unified, 64 | title={Unified embedding alignment for open-vocabulary video instance segmentation}, 65 | author={Fang, Hao and Wu, Peng and Li, Yawei and Zhang, Xinxin and Lu, Xiankai}, 66 | booktitle={ECCV}, 67 | pages={225--241}, 68 | year={2025}, 69 | organization={Springer} 70 | } 71 | ``` 72 | 73 | ## Acknowledgement 74 | 75 | This repo is based on [detectron2](https://github.com/facebookresearch/detectron2), 76 | [Mask2Former](https://github.com/facebookresearch/Mask2Former), 77 | and [LVVIS](https://github.com/haochenheheda/LVVIS). Thanks for their great work! 
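## Programmatic Evaluation (sketch)

```mAP.py``` can also be driven from Python. The snippet below is a minimal sketch that mirrors its logic for the LV-VIS validation split; it assumes predictions have already been written to `output/inference/results.json` and that the annotations sit at the default path used by this repo.

```python
# Minimal sketch mirroring mAP.py for the LV-VIS val split (paths are the repo defaults).
from evaluate.lvvis import LVVIS
from evaluate.lvviseval import LVVISeval

gt = LVVIS('datasets/LVVIS/val/val_instances_.json')  # ground-truth annotations
dt = gt.loadRes('output/inference/results.json')      # predicted segmentations
ev = LVVISeval(gt, dt, "segm")                        # mask (segm) evaluation
ev.evaluate()
ev.accumulate()
ev.summarize()                                        # prints the AP/AR summary
```

For BURST, swap in `BURST`/`BURSTeval` and `datasets/burst/b2y_val.json`, exactly as ```mAP.py``` does.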
78 | -------------------------------------------------------------------------------- /tools/vis_results.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import json 4 | import cv2 5 | import tqdm 6 | from pycocotools import mask as pymask 7 | import numpy as np 8 | import tqdm 9 | 10 | def get_center(mask): 11 | # Get the central part of the object 12 | h1,h2 = np.argwhere(mask.sum(axis=1).reshape(-1)).min(), np.argwhere(mask.sum(axis=1).reshape(-1)).max() 13 | w1,w2 = np.argwhere(mask.sum(axis=0).reshape(-1)).min(), np.argwhere(mask.sum(axis=0).reshape(-1)).max() 14 | return int((h1+h2)/2), int((w1+w2)/2), h1, w1, h2, w2 15 | 16 | color_map = [[20,255,20], [20, 20, 255], [255, 20, 20], [20, 255, 255], [255,20,255], [255,255,20],[42,42,128],[165, 42, 42], [134, 134, 103], [0, 0, 142], [255, 109, 65], \ 17 | [0, 226, 252], [5, 121, 0], [0, 60, 100], [250, 170, 30], [100, 170, 30], [179, 0, 194], [255, 77, 255], [120, 166, 157], \ 18 | [73, 77, 174], [0, 80, 100], [182, 182, 255], [0, 143, 149], [174, 57, 255], [0, 0, 230], [72, 0, 118], [255, 179, 240], \ 19 | [0, 125, 92], [209, 0, 151], [188, 208, 182], [145, 148, 174], [106, 0, 228], [0, 0, 70], [199, 100, 0], [166, 196, 102], \ 20 | [110, 76, 0], [133, 129, 255], [0, 0, 192], [183, 130, 88], [130, 114, 135], [107, 142, 35], [0, 228, 0], [174, 255, 243], [255, 208, 186]] 21 | 22 | 23 | output_dir = 'output/lvvis_vis' 24 | anno_json = 'datasets/LVVIS/val/val_instances_.json' 25 | dt_json = 'output/ov2seg/inference/lvvis_val/results.json' 26 | img_dir = 'datasets/LVVIS/val/JPEGImages' 27 | 28 | 29 | dt = json.load(open(dt_json, 'r')) 30 | data = json.load(open(anno_json, 'r')) 31 | categories = data['categories'] 32 | videos = data['videos'] 33 | 34 | dt_dic = {} 35 | category_dic = {} 36 | for category in categories: 37 | category_dic[category['id']] = category['name'] 38 | 39 | for d in dt: 40 | if d['video_id'] not in dt_dic.keys(): 41 | dt_dic[d['video_id']] = [] 42 | dt_dic[d['video_id']].append(d) 43 | for video in tqdm.tqdm(videos): 44 | video_name = video['file_names'][0].split('/')[0] 45 | img_list = video['file_names'] 46 | img_list.sort() 47 | video_id = video['id'] 48 | video_dt = dt_dic[video_id] 49 | for fid, img_path in enumerate(img_list): 50 | img = cv2.imread(os.path.join(img_dir, img_path)) 51 | h,w,_ = img.shape 52 | mask_vis = np.zeros((h,w,3)) 53 | for obj_id, obj in enumerate(video_dt): 54 | category_id = obj['category_id'] 55 | category_name = category_dic[category_id] 56 | score = obj['score'] 57 | if score < 0.5: 58 | continue 59 | obj_mask = pymask.decode(obj['segmentations'][fid]) 60 | if obj_mask.sum() == 0: 61 | continue 62 | color = color_map[int(obj_id)%len(color_map)] 63 | mask_vis[obj_mask > 0] = color 64 | img[obj_mask > 0] = img[obj_mask > 0] * 0.45 + mask_vis[obj_mask>0]*0.55 65 | contours,hierarchy = cv2.findContours(obj_mask,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) 66 | img = cv2.drawContours(img,contours,-1,(222,222,222),2) 67 | h_,w_,y1,x1,y2,x2 = get_center(obj_mask) 68 | img = cv2.putText(img, category_name, ((x1 + x2)//2 - 45, (y1+y2)//2 -25), cv2.FONT_HERSHEY_SIMPLEX, 1.3, (255,255,255), 5) 69 | img = cv2.putText(img, category_name, ((x1 + x2)//2 - 45, (y1+y2)//2 -25), cv2.FONT_HERSHEY_SIMPLEX, 1.3, color, 2) 70 | 71 | img_name = img_path.split('/')[-1] 72 | os.makedirs(os.path.join(output_dir, video_name),exist_ok=True) 73 | output_path = os.path.join(output_dir, video_name, img_name) 74 | cv2.imwrite(output_path, img) 75 | 76 
| -------------------------------------------------------------------------------- /ovformer/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | try: 22 | import MultiScaleDeformableAttention as MSDA 23 | except ModuleNotFoundError as e: 24 | info_string = ( 25 | "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" 26 | "\t`cd mask2former/modeling/pixel_decoder/ops`\n" 27 | "\t`sh make.sh`\n" 28 | ) 29 | raise ModuleNotFoundError(info_string) 30 | 31 | 32 | class MSDeformAttnFunction(Function): 33 | @staticmethod 34 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 35 | ctx.im2col_step = im2col_step 36 | output = MSDA.ms_deform_attn_forward( 37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 38 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 39 | return output 40 | 41 | @staticmethod 42 | @once_differentiable 43 | def backward(ctx, grad_output): 44 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 45 | grad_value, grad_sampling_loc, grad_attn_weight = \ 46 | MSDA.ms_deform_attn_backward( 47 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 48 | 49 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 50 | 51 | 52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 53 | # for debug and test only, 54 | # need to use cuda version instead 55 | N_, S_, M_, D_ = value.shape 56 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 57 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 58 | sampling_grids = 2 * sampling_locations - 1 59 | sampling_value_list = [] 60 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 61 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 62 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 63 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 64 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 65 | # N_*M_, D_, Lq_, P_ 66 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 67 | 
mode='bilinear', padding_mode='zeros', align_corners=False) 68 | sampling_value_list.append(sampling_value_l_) 69 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 70 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 71 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 72 | return output.transpose(1, 2).contiguous() 73 | -------------------------------------------------------------------------------- /ovformer/test_time_augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | import logging 4 | from itertools import count 5 | 6 | import numpy as np 7 | import torch 8 | from fvcore.transforms import HFlipTransform 9 | from torch import nn 10 | from torch.nn.parallel import DistributedDataParallel 11 | 12 | from detectron2.data.detection_utils import read_image 13 | from detectron2.modeling import DatasetMapperTTA 14 | 15 | 16 | __all__ = [ 17 | "SemanticSegmentorWithTTA", 18 | ] 19 | 20 | 21 | class SemanticSegmentorWithTTA(nn.Module): 22 | """ 23 | A SemanticSegmentor with test-time augmentation enabled. 24 | Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`. 25 | """ 26 | 27 | def __init__(self, cfg, model, tta_mapper=None, batch_size=1): 28 | """ 29 | Args: 30 | cfg (CfgNode): 31 | model (SemanticSegmentor): a SemanticSegmentor to apply TTA on. 32 | tta_mapper (callable): takes a dataset dict and returns a list of 33 | augmented versions of the dataset dict. Defaults to 34 | `DatasetMapperTTA(cfg)`. 35 | batch_size (int): batch the augmented images into this batch size for inference. 
36 | """ 37 | super().__init__() 38 | if isinstance(model, DistributedDataParallel): 39 | model = model.module 40 | self.cfg = cfg.clone() 41 | 42 | self.model = model 43 | 44 | if tta_mapper is None: 45 | tta_mapper = DatasetMapperTTA(cfg) 46 | self.tta_mapper = tta_mapper 47 | self.batch_size = batch_size 48 | 49 | def __call__(self, batched_inputs): 50 | """ 51 | Same input/output format as :meth:`SemanticSegmentor.forward` 52 | """ 53 | 54 | def _maybe_read_image(dataset_dict): 55 | ret = copy.copy(dataset_dict) 56 | if "image" not in ret: 57 | image = read_image(ret.pop("file_name"), self.model.input_format) 58 | image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW 59 | ret["image"] = image 60 | if "height" not in ret and "width" not in ret: 61 | ret["height"] = image.shape[1] 62 | ret["width"] = image.shape[2] 63 | return ret 64 | 65 | processed_results = [] 66 | for x in batched_inputs: 67 | result = self._inference_one_image(_maybe_read_image(x)) 68 | processed_results.append(result) 69 | return processed_results 70 | 71 | def _inference_one_image(self, input): 72 | """ 73 | Args: 74 | input (dict): one dataset dict with "image" field being a CHW tensor 75 | Returns: 76 | dict: one output dict 77 | """ 78 | orig_shape = (input["height"], input["width"]) 79 | augmented_inputs, tfms = self._get_augmented_inputs(input) 80 | 81 | final_predictions = None 82 | count_predictions = 0 83 | for input, tfm in zip(augmented_inputs, tfms): 84 | count_predictions += 1 85 | with torch.no_grad(): 86 | if final_predictions is None: 87 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 88 | final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2]) 89 | else: 90 | final_predictions = self.model([input])[0].pop("sem_seg") 91 | else: 92 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 93 | final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2]) 94 | else: 95 | final_predictions += self.model([input])[0].pop("sem_seg") 96 | 97 | final_predictions = final_predictions / count_predictions 98 | return {"sem_seg": final_predictions} 99 | 100 | def _get_augmented_inputs(self, input): 101 | augmented_inputs = self.tta_mapper(input) 102 | tfms = [x.pop("transforms") for x in augmented_inputs] 103 | return augmented_inputs, tfms 104 | -------------------------------------------------------------------------------- /ovformer/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py 3 | """ 4 | Misc functions, including distributed helpers. 5 | 6 | Mostly copy-paste from torchvision references. 
7 | """ 8 | from typing import List, Optional 9 | 10 | import torch 11 | import torch.distributed as dist 12 | import torchvision 13 | from torch import Tensor 14 | 15 | 16 | def _max_by_axis(the_list): 17 | # type: (List[List[int]]) -> List[int] 18 | maxes = the_list[0] 19 | for sublist in the_list[1:]: 20 | for index, item in enumerate(sublist): 21 | maxes[index] = max(maxes[index], item) 22 | return maxes 23 | 24 | 25 | class NestedTensor(object): 26 | def __init__(self, tensors, mask: Optional[Tensor]): 27 | self.tensors = tensors 28 | self.mask = mask 29 | 30 | def to(self, device): 31 | # type: (Device) -> NestedTensor # noqa 32 | cast_tensor = self.tensors.to(device) 33 | mask = self.mask 34 | if mask is not None: 35 | assert mask is not None 36 | cast_mask = mask.to(device) 37 | else: 38 | cast_mask = None 39 | return NestedTensor(cast_tensor, cast_mask) 40 | 41 | def decompose(self): 42 | return self.tensors, self.mask 43 | 44 | def __repr__(self): 45 | return str(self.tensors) 46 | 47 | 48 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 49 | # TODO make this more general 50 | if tensor_list[0].ndim == 3: 51 | if torchvision._is_tracing(): 52 | # nested_tensor_from_tensor_list() does not export well to ONNX 53 | # call _onnx_nested_tensor_from_tensor_list() instead 54 | return _onnx_nested_tensor_from_tensor_list(tensor_list) 55 | 56 | # TODO make it support different-sized images 57 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 58 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 59 | batch_shape = [len(tensor_list)] + max_size 60 | b, c, h, w = batch_shape 61 | dtype = tensor_list[0].dtype 62 | device = tensor_list[0].device 63 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 64 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 65 | for img, pad_img, m in zip(tensor_list, tensor, mask): 66 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 67 | m[: img.shape[1], : img.shape[2]] = False 68 | else: 69 | raise ValueError("not supported") 70 | return NestedTensor(tensor, mask) 71 | 72 | 73 | # _onnx_nested_tensor_from_tensor_list() is an implementation of 74 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
75 | @torch.jit.unused 76 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: 77 | max_size = [] 78 | for i in range(tensor_list[0].dim()): 79 | max_size_i = torch.max( 80 | torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) 81 | ).to(torch.int64) 82 | max_size.append(max_size_i) 83 | max_size = tuple(max_size) 84 | 85 | # work around for 86 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 87 | # m[: img.shape[1], :img.shape[2]] = False 88 | # which is not yet supported in onnx 89 | padded_imgs = [] 90 | padded_masks = [] 91 | for img in tensor_list: 92 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 93 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 94 | padded_imgs.append(padded_img) 95 | 96 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) 97 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) 98 | padded_masks.append(padded_mask.to(torch.bool)) 99 | 100 | tensor = torch.stack(padded_imgs) 101 | mask = torch.stack(padded_masks) 102 | 103 | return NestedTensor(tensor, mask=mask) 104 | 105 | 106 | def is_dist_avail_and_initialized(): 107 | if not dist.is_available(): 108 | return False 109 | if not dist.is_initialized(): 110 | return False 111 | return True 112 | -------------------------------------------------------------------------------- /ovformer/modeling/pixel_decoder/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import time 17 | import torch 18 | import torch.nn as nn 19 | from torch.autograd import gradcheck 20 | 21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | N, M, D = 1, 2, 2 25 | Lq, L, P = 2, 2, 2 26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 28 | S = sum([(H*W).item() for H, W in shapes]) 29 | 30 | 31 | torch.manual_seed(3) 32 | 33 | 34 | @torch.no_grad() 35 | def check_forward_equal_with_pytorch_double(): 36 | value = torch.rand(N, S, M, D).cuda() * 0.01 37 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 38 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 39 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 40 | im2col_step = 2 41 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 42 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 43 | fwdok = torch.allclose(output_cuda, output_pytorch) 44 | max_abs_err = (output_cuda - output_pytorch).abs().max() 45 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 46 | 47 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 48 | 49 | 50 | @torch.no_grad() 51 | def check_forward_equal_with_pytorch_float(): 52 | value = torch.rand(N, S, M, D).cuda() * 0.01 53 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 54 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 55 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 56 | im2col_step = 2 57 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 58 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 59 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 60 | max_abs_err = (output_cuda - output_pytorch).abs().max() 61 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 62 | 63 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 64 | 65 | 66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 67 | 68 | value = torch.rand(N, S, M, channels).cuda() * 0.01 69 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 70 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 71 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 72 | im2col_step = 2 73 | func = MSDeformAttnFunction.apply 74 | 75 | value.requires_grad = grad_value 76 | sampling_locations.requires_grad = grad_sampling_loc 77 | attention_weights.requires_grad = grad_attn_weight 78 | 79 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 80 | 81 | print(f'* 
{gradok} check_gradient_numerical(D={channels})') 82 | 83 | 84 | if __name__ == '__main__': 85 | check_forward_equal_with_pytorch_double() 86 | check_forward_equal_with_pytorch_float() 87 | 88 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 89 | check_gradient_numerical(channels, True, True, True) 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /ovformer/data/datasets/register_ade20k_instance.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import json 3 | import logging 4 | import numpy as np 5 | import os 6 | from PIL import Image 7 | 8 | from detectron2.data import DatasetCatalog, MetadataCatalog 9 | from detectron2.data.datasets.coco import load_coco_json, register_coco_instances 10 | from detectron2.utils.file_io import PathManager 11 | 12 | ADE_CATEGORIES = [{'id': 7, 'name': 'bed'}, {'id': 8, 'name': 'windowpane'}, {'id': 10, 'name': 'cabinet'}, {'id': 12, 'name': 'person'}, {'id': 14, 'name': 'door'}, {'id': 15, 'name': 'table'}, {'id': 18, 'name': 'curtain'}, {'id': 19, 'name': 'chair'}, {'id': 20, 'name': 'car'}, {'id': 22, 'name': 'painting'}, {'id': 23, 'name': 'sofa'}, {'id': 24, 'name': 'shelf'}, {'id': 27, 'name': 'mirror'}, {'id': 30, 'name': 'armchair'}, {'id': 31, 'name': 'seat'}, {'id': 32, 'name': 'fence'}, {'id': 33, 'name': 'desk'}, {'id': 35, 'name': 'wardrobe'}, {'id': 36, 'name': 'lamp'}, {'id': 37, 'name': 'bathtub'}, {'id': 38, 'name': 'railing'}, {'id': 39, 'name': 'cushion'}, {'id': 41, 'name': 'box'}, {'id': 42, 'name': 'column'}, {'id': 43, 'name': 'signboard'}, {'id': 44, 'name': 'chest of drawers'}, {'id': 45, 'name': 'counter'}, {'id': 47, 'name': 'sink'}, {'id': 49, 'name': 'fireplace'}, {'id': 50, 'name': 'refrigerator'}, {'id': 53, 'name': 'stairs'}, {'id': 55, 'name': 'case'}, {'id': 56, 'name': 'pool table'}, {'id': 57, 'name': 'pillow'}, {'id': 58, 'name': 'screen door'}, {'id': 62, 'name': 'bookcase'}, {'id': 64, 'name': 'coffee table'}, {'id': 65, 'name': 'toilet'}, {'id': 66, 'name': 'flower'}, {'id': 67, 'name': 'book'}, {'id': 69, 'name': 'bench'}, {'id': 70, 'name': 'countertop'}, {'id': 71, 'name': 'stove'}, {'id': 72, 'name': 'palm'}, {'id': 73, 'name': 'kitchen island'}, {'id': 74, 'name': 'computer'}, {'id': 75, 'name': 'swivel chair'}, {'id': 76, 'name': 'boat'}, {'id': 78, 'name': 'arcade machine'}, {'id': 80, 'name': 'bus'}, {'id': 81, 'name': 'towel'}, {'id': 82, 'name': 'light'}, {'id': 83, 'name': 'truck'}, {'id': 85, 'name': 'chandelier'}, {'id': 86, 'name': 'awning'}, {'id': 87, 'name': 'streetlight'}, {'id': 88, 'name': 'booth'}, {'id': 89, 'name': 'television receiver'}, {'id': 90, 'name': 'airplane'}, {'id': 92, 'name': 'apparel'}, {'id': 93, 'name': 'pole'}, {'id': 95, 'name': 'bannister'}, {'id': 97, 'name': 'ottoman'}, {'id': 98, 'name': 'bottle'}, {'id': 102, 'name': 'van'}, {'id': 103, 'name': 'ship'}, {'id': 104, 'name': 'fountain'}, {'id': 107, 'name': 'washer'}, {'id': 108, 'name': 'plaything'}, {'id': 110, 'name': 'stool'}, {'id': 111, 'name': 'barrel'}, {'id': 112, 'name': 'basket'}, {'id': 115, 'name': 'bag'}, {'id': 116, 'name': 'minibike'}, {'id': 118, 'name': 'oven'}, {'id': 119, 'name': 'ball'}, {'id': 120, 'name': 'food'}, {'id': 121, 'name': 'step'}, {'id': 123, 'name': 'trade name'}, {'id': 124, 'name': 'microwave'}, {'id': 125, 'name': 'pot'}, {'id': 126, 'name': 'animal'}, {'id': 127, 'name': 'bicycle'}, {'id': 129, 'name': 'dishwasher'}, {'id': 130, 'name': 
'screen'}, {'id': 132, 'name': 'sculpture'}, {'id': 133, 'name': 'hood'}, {'id': 134, 'name': 'sconce'}, {'id': 135, 'name': 'vase'}, {'id': 136, 'name': 'traffic light'}, {'id': 137, 'name': 'tray'}, {'id': 138, 'name': 'ashcan'}, {'id': 139, 'name': 'fan'}, {'id': 142, 'name': 'plate'}, {'id': 143, 'name': 'monitor'}, {'id': 144, 'name': 'bulletin board'}, {'id': 146, 'name': 'radiator'}, {'id': 147, 'name': 'glass'}, {'id': 148, 'name': 'clock'}, {'id': 149, 'name': 'flag'}] 13 | 14 | 15 | _PREDEFINED_SPLITS = { 16 | # point annotations without masks 17 | "ade20k_instance_train": ( 18 | "ADEChallengeData2016/images/training", 19 | "ADEChallengeData2016/ade20k_instance_train.json", 20 | ), 21 | "ade20k_instance_val": ( 22 | "ADEChallengeData2016/images/validation", 23 | "ADEChallengeData2016/ade20k_instance_val.json", 24 | ), 25 | } 26 | 27 | 28 | def _get_ade_instances_meta(): 29 | thing_ids = [k["id"] for k in ADE_CATEGORIES] 30 | assert len(thing_ids) == 100, len(thing_ids) 31 | # Mapping from the incontiguous ADE category id to an id in [0, 99] 32 | thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} 33 | thing_classes = [k["name"] for k in ADE_CATEGORIES] 34 | ret = { 35 | "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, 36 | "thing_classes": thing_classes, 37 | } 38 | return ret 39 | 40 | 41 | def register_all_ade20k_instance(root): 42 | for key, (image_root, json_file) in _PREDEFINED_SPLITS.items(): 43 | # Assume pre-defined datasets live in `./datasets`. 44 | register_coco_instances( 45 | key, 46 | _get_ade_instances_meta(), 47 | os.path.join(root, json_file) if "://" not in json_file else json_file, 48 | os.path.join(root, image_root), 49 | ) 50 | 51 | 52 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 53 | register_all_ade20k_instance(_root) 54 | -------------------------------------------------------------------------------- /evaluate/mask.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tsungyi' 2 | 3 | import pycocotools._mask as _mask 4 | 5 | # Interface for manipulating masks stored in RLE format. 6 | # 7 | # RLE is a simple yet efficient format for storing binary masks. RLE 8 | # first divides a vector (or vectorized image) into a series of piecewise 9 | # constant regions and then for each piece simply stores the length of 10 | # that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would 11 | # be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1] 12 | # (note that the odd counts are always the numbers of zeros). Instead of 13 | # storing the counts directly, additional compression is achieved with a 14 | # variable bitrate representation based on a common scheme called LEB128. 15 | # 16 | # Compression is greatest given large piecewise constant regions. 17 | # Specifically, the size of the RLE is proportional to the number of 18 | # *boundaries* in M (or for an image the number of boundaries in the y 19 | # direction). Assuming fairly simple shapes, the RLE representation is 20 | # O(sqrt(n)) where n is number of pixels in the object. Hence space usage 21 | # is substantially lower, especially for large simple objects (large n). 22 | # 23 | # Many common operations on masks can be computed directly using the RLE 24 | # (without need for decoding). This includes computations such as area, 25 | # union, intersection, etc. 
All of these operations are linear in the 26 | # size of the RLE, in other words they are O(sqrt(n)) where n is the area 27 | # of the object. Computing these operations on the original mask is O(n). 28 | # Thus, using the RLE can result in substantial computational savings. 29 | # 30 | # The following API functions are defined: 31 | # encode - Encode binary masks using RLE. 32 | # decode - Decode binary masks encoded via RLE. 33 | # merge - Compute union or intersection of encoded masks. 34 | # iou - Compute intersection over union between masks. 35 | # area - Compute area of encoded masks. 36 | # toBbox - Get bounding boxes surrounding encoded masks. 37 | # frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask. 38 | # 39 | # Usage: 40 | # Rs = encode( masks ) 41 | # masks = decode( Rs ) 42 | # R = merge( Rs, intersect=false ) 43 | # o = iou( dt, gt, iscrowd ) 44 | # a = area( Rs ) 45 | # bbs = toBbox( Rs ) 46 | # Rs = frPyObjects( [pyObjects], h, w ) 47 | # 48 | # In the API the following formats are used: 49 | # Rs - [dict] Run-length encoding of binary masks 50 | # R - dict Run-length encoding of binary mask 51 | # masks - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order) 52 | # iscrowd - [nx1] list of np.ndarray. 1 indicates corresponding gt image has crowd region to ignore 53 | # bbs - [nx4] Bounding box(es) stored as [x y w h] 54 | # poly - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list) 55 | # dt,gt - May be either bounding boxes or encoded masks 56 | # Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel). 57 | # 58 | # Finally, a note about the intersection over union (iou) computation. 59 | # The standard iou of a ground truth (gt) and detected (dt) object is 60 | # iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt)) 61 | # For "crowd" regions, we use a modified criteria. If a gt object is 62 | # marked as "iscrowd", we allow a dt to match any subregion of the gt. 63 | # Choosing gt' in the crowd gt that best matches the dt can be done using 64 | # gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing 65 | # iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt) 66 | # For crowd gt regions we use this modified criteria above for the iou. 67 | # 68 | # To compile run "python setup.py build_ext --inplace" 69 | # Please do not contact us for help with compiling. 70 | # 71 | # Microsoft COCO Toolbox. version 2.0 72 | # Data, paper, and tutorials available at: http://mscoco.org/ 73 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 
74 | # Licensed under the Simplified BSD License [see coco/license.txt] 75 | 76 | iou = _mask.iou 77 | merge = _mask.merge 78 | frPyObjects = _mask.frPyObjects 79 | 80 | def encode(bimask): 81 | if len(bimask.shape) == 3: 82 | return _mask.encode(bimask) 83 | elif len(bimask.shape) == 2: 84 | h, w = bimask.shape 85 | return _mask.encode(bimask.reshape((h, w, 1), order='F'))[0] 86 | 87 | def decode(rleObjs): 88 | if type(rleObjs) == list: 89 | return _mask.decode(rleObjs) 90 | else: 91 | return _mask.decode([rleObjs])[:,:,0] 92 | 93 | def area(rleObjs): 94 | if type(rleObjs) == list: 95 | return _mask.area(rleObjs) 96 | else: 97 | return _mask.area([rleObjs])[0] 98 | 99 | def toBbox(rleObjs): 100 | if type(rleObjs) == list: 101 | return _mask.toBbox(rleObjs) 102 | else: 103 | return _mask.toBbox([rleObjs])[0] -------------------------------------------------------------------------------- /ovformer/data_video/datasets/builtin.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | import os 5 | 6 | from .ytvis import ( 7 | register_ytvis_instances, 8 | _get_ytvis_2019_instances_meta, 9 | _get_ytvis_2021_instances_meta, 10 | ) 11 | 12 | from .ovis import ( 13 | register_ovis_instances, 14 | _get_ovis_instances_meta, 15 | ) 16 | 17 | from .lvvis import ( 18 | register_lvvis_instances, 19 | _get_lvvis_instances_meta, 20 | ) 21 | 22 | from .burst import ( 23 | register_burst_instances, 24 | _get_burst_instances_meta, 25 | ) 26 | 27 | # ==== Predefined splits for YTVIS 2019 =========== 28 | _PREDEFINED_SPLITS_YTVIS_2019 = { 29 | "ytvis_2019_train": ("ytvis_2019/train/JPEGImages", 30 | "ytvis_2019/train.json"), 31 | "ytvis_2019_val": ("ytvis_2019/valid/JPEGImages", 32 | "ytvis_2019/valid.json"), 33 | "ytvis_2019_test": ("ytvis_2019/test/JPEGImages", 34 | "ytvis_2019/test.json"), 35 | } 36 | 37 | 38 | # ==== Predefined splits for YTVIS 2021 =========== 39 | _PREDEFINED_SPLITS_YTVIS_2021 = { 40 | "ytvis_2021_train": ("ytvis_2021/train/JPEGImages", 41 | "ytvis_2021/train.json"), 42 | "ytvis_2021_val": ("ytvis_2021/valid/JPEGImages", 43 | "ytvis_2021/valid.json"), 44 | "ytvis_2021_test": ("ytvis_2021/test/JPEGImages", 45 | "ytvis_2021/test.json"), 46 | } 47 | 48 | # ==== Predefined splits for OVIS =========== 49 | _PREDEFINED_SPLITS_OVIS = { 50 | "ovis_train": ("ovis/train", 51 | "ovis/annotations/train.json"), 52 | "ovis_val": ("ovis/valid", 53 | "ovis/annotations/valid.json"), 54 | "ovis_test": ("ovis/test", 55 | "ovis/annotations/test.json"), 56 | } 57 | 58 | # ==== Predefined splits for LVVIS =========== 59 | _PREDEFINED_SPLITS_LVVIS = { 60 | "lvvis_train": ("LVVIS/train/JPEGImages", 61 | "LVVIS/train/train_instances_nonovel.json"), 62 | "lvvis_val": ("LVVIS/val/JPEGImages", 63 | "LVVIS/val/val_instances_.json"), 64 | "lvvis_test": ("LVVIS/test/JPEGImages", 65 | "LVVIS/test/test_instances.json"), 66 | } 67 | 68 | _PREDEFINED_SPLITS_BURST= { 69 | "burst_val": ("burst/val", 70 | "burst/b2y_val.json"), 71 | } 72 | 73 | 74 | def register_all_ytvis_2019(root): 75 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2019.items(): 76 | # Assume pre-defined datasets live in `./datasets`. 
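# Each call below binds a dataset name (e.g. "ytvis_2019_train") to its annotation JSON and
# image root, so the DATASETS entries in the training/eval configs can refer to it by name.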
77 | register_ytvis_instances( 78 | key, 79 | _get_ytvis_2019_instances_meta(), 80 | os.path.join(root, json_file) if "://" not in json_file else json_file, 81 | os.path.join(root, image_root), 82 | ) 83 | 84 | 85 | def register_all_ytvis_2021(root): 86 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2021.items(): 87 | # Assume pre-defined datasets live in `./datasets`. 88 | register_ytvis_instances( 89 | key, 90 | _get_ytvis_2021_instances_meta(), 91 | os.path.join(root, json_file) if "://" not in json_file else json_file, 92 | os.path.join(root, image_root), 93 | ) 94 | 95 | def register_all_ovis(root): 96 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_OVIS.items(): 97 | # Assume pre-defined datasets live in `./datasets`. 98 | register_ovis_instances( 99 | key, 100 | _get_ovis_instances_meta(), 101 | os.path.join(root, json_file) if "://" not in json_file else json_file, 102 | os.path.join(root, image_root), 103 | ) 104 | 105 | def register_all_lvvis(root): 106 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_LVVIS.items(): 107 | # Assume pre-defined datasets live in `./datasets`. 108 | register_lvvis_instances( 109 | key, 110 | _get_lvvis_instances_meta(), 111 | os.path.join(root, json_file) if "://" not in json_file else json_file, 112 | os.path.join(root, image_root), 113 | ) 114 | 115 | def register_all_burst(root): 116 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_BURST.items(): 117 | # Assume pre-defined datasets live in `./datasets`. 118 | register_burst_instances( 119 | key, 120 | _get_burst_instances_meta(), 121 | os.path.join(root, json_file) if "://" not in json_file else json_file, 122 | os.path.join(root, image_root), 123 | ) 124 | 125 | if __name__.endswith(".builtin"): 126 | # Assume pre-defined datasets live in `./datasets`. 127 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 128 | register_all_ytvis_2019(_root) 129 | register_all_ytvis_2021(_root) 130 | register_all_ovis(_root) 131 | register_all_lvvis(_root) 132 | register_all_burst(_root) 133 | -------------------------------------------------------------------------------- /ovformer/evaluation/instance_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import contextlib 3 | import copy 4 | import io 5 | import itertools 6 | import json 7 | import logging 8 | import numpy as np 9 | import os 10 | import pickle 11 | from collections import OrderedDict 12 | import pycocotools.mask as mask_util 13 | import torch 14 | from pycocotools.coco import COCO 15 | from pycocotools.cocoeval import COCOeval 16 | from tabulate import tabulate 17 | 18 | import detectron2.utils.comm as comm 19 | from detectron2.config import CfgNode 20 | from detectron2.data import MetadataCatalog 21 | from detectron2.data.datasets.coco import convert_to_coco_json 22 | from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco 23 | from detectron2.evaluation.fast_eval_api import COCOeval_opt 24 | from detectron2.structures import Boxes, BoxMode, pairwise_iou 25 | from detectron2.utils.file_io import PathManager 26 | from detectron2.utils.logger import create_small_table 27 | 28 | 29 | # modified from COCOEvaluator for instance segmetnat 30 | class InstanceSegEvaluator(COCOEvaluator): 31 | """ 32 | Evaluate AR for object proposals, AP for instance detection/segmentation, AP 33 | for keypoint detection outputs using COCO's metrics. 
34 | See http://cocodataset.org/#detection-eval and 35 | http://cocodataset.org/#keypoints-eval to understand its metrics. 36 | The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means 37 | the metric cannot be computed (e.g. due to no predictions made). 38 | 39 | In addition to COCO, this evaluator is able to support any bounding box detection, 40 | instance segmentation, or keypoint detection dataset. 41 | """ 42 | 43 | def _eval_predictions(self, predictions, img_ids=None): 44 | """ 45 | Evaluate predictions. Fill self._results with the metrics of the tasks. 46 | """ 47 | self._logger.info("Preparing results for COCO format ...") 48 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) 49 | tasks = self._tasks or self._tasks_from_predictions(coco_results) 50 | 51 | # unmap the category ids for COCO 52 | if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): 53 | dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id 54 | # all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) 55 | # num_classes = len(all_contiguous_ids) 56 | # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 57 | 58 | reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} 59 | for result in coco_results: 60 | category_id = result["category_id"] 61 | # assert category_id < num_classes, ( 62 | # f"A prediction has class={category_id}, " 63 | # f"but the dataset only has {num_classes} classes and " 64 | # f"predicted class id should be in [0, {num_classes - 1}]." 65 | # ) 66 | assert category_id in reverse_id_mapping, ( 67 | f"A prediction has class={category_id}, " 68 | f"but the dataset only has class ids in {dataset_id_to_contiguous_id}." 69 | ) 70 | result["category_id"] = reverse_id_mapping[category_id] 71 | 72 | if self._output_dir: 73 | file_path = os.path.join(self._output_dir, "coco_instances_results.json") 74 | self._logger.info("Saving results to {}".format(file_path)) 75 | with PathManager.open(file_path, "w") as f: 76 | f.write(json.dumps(coco_results)) 77 | f.flush() 78 | 79 | if not self._do_evaluation: 80 | self._logger.info("Annotations are not available for evaluation.") 81 | return 82 | 83 | self._logger.info( 84 | "Evaluating predictions with {} COCO API...".format( 85 | "unofficial" if self._use_fast_impl else "official" 86 | ) 87 | ) 88 | for task in sorted(tasks): 89 | assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!" 90 | coco_eval = ( 91 | _evaluate_predictions_on_coco( 92 | self._coco_api, 93 | coco_results, 94 | task, 95 | kpt_oks_sigmas=self._kpt_oks_sigmas, 96 | use_fast_impl=self._use_fast_impl, 97 | img_ids=img_ids, 98 | max_dets_per_image=self._max_dets_per_image, 99 | ) 100 | if len(coco_results) > 0 101 | else None # cocoapi does not handle empty results very well 102 | ) 103 | 104 | res = self._derive_coco_results( 105 | coco_eval, task, class_names=self._metadata.get("thing_classes") 106 | ) 107 | self._results[task] = res 108 | -------------------------------------------------------------------------------- /ovformer/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | from detectron2.config import CfgNode as CN 4 | 5 | 6 | def add_ovformer_config(cfg): 7 | """ 8 | Add config for ovformer. 
9 | """ 10 | # data config 11 | # select the dataset mapper 12 | cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic" 13 | # Color augmentation 14 | cfg.INPUT.COLOR_AUG_SSD = False 15 | # We retry random cropping until no single category in semantic segmentation GT occupies more 16 | # than `SINGLE_CATEGORY_MAX_AREA` part of the crop. 17 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0 18 | # Pad image and segmentation GT in dataset mapper. 19 | cfg.INPUT.SIZE_DIVISIBILITY = -1 20 | 21 | cfg.INPUT.SAMPLING_FRAME_NUM = 2 22 | cfg.INPUT.SAMPLING_FRAME_RANGE = 20 23 | cfg.INPUT.SAMPLING_FRAME_SHUFFLE = False 24 | cfg.INPUT.AUGMENTATIONS = [] # "brightness", "contrast", "saturation", "rotation" 25 | 26 | 27 | # solver config 28 | # weight decay on embedding 29 | cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0 30 | # optimizer 31 | cfg.SOLVER.OPTIMIZER = "ADAMW" 32 | cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1 33 | 34 | # mask_former model config 35 | cfg.MODEL.MASK_FORMER = CN() 36 | 37 | 38 | # loss 39 | cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION = True 40 | cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT = 0.1 41 | cfg.MODEL.MASK_FORMER.OBJECT_WEIGHT = 1.0 42 | cfg.MODEL.MASK_FORMER.CLASS_WEIGHT = 1.0 43 | cfg.MODEL.MASK_FORMER.DICE_WEIGHT = 1.0 44 | cfg.MODEL.MASK_FORMER.MASK_WEIGHT = 20.0 45 | 46 | # transformer config 47 | cfg.MODEL.MASK_FORMER.NHEADS = 8 48 | cfg.MODEL.MASK_FORMER.DROPOUT = 0.1 49 | cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD = 2048 50 | cfg.MODEL.MASK_FORMER.ENC_LAYERS = 0 51 | cfg.MODEL.MASK_FORMER.DEC_LAYERS = 6 52 | cfg.MODEL.MASK_FORMER.PRE_NORM = False 53 | 54 | cfg.MODEL.MASK_FORMER.HIDDEN_DIM = 256 55 | cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES = 100 56 | 57 | cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE = "res5" 58 | cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ = False 59 | 60 | # mask_former inference config 61 | cfg.MODEL.MASK_FORMER.TEST = CN() 62 | cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True 63 | cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = False 64 | cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = False 65 | cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0 66 | cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD = 0.0 67 | cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False 68 | 69 | # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. 
ResNet) 70 | # you can use this config to override 71 | cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY = 32 72 | 73 | 74 | cfg.MODEL.MASK_FORMER.CLIP_TEXT_PATH = '' 75 | cfg.MODEL.MASK_FORMER.CLIP_IMAGE_PATH = '' 76 | 77 | # classifier config 78 | cfg.MODEL.MASK_FORMER.CLIP_CLASSIFIER = False 79 | cfg.MODEL.MASK_FORMER.AGNOSTIC_CLASSIFIER = False 80 | 81 | # pixel decoder config 82 | cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256 83 | # adding transformer in pixel decoder 84 | cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0 85 | # pixel decoder 86 | cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder" 87 | 88 | # swin transformer backbone 89 | cfg.MODEL.SWIN = CN() 90 | cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224 91 | cfg.MODEL.SWIN.PATCH_SIZE = 4 92 | cfg.MODEL.SWIN.EMBED_DIM = 96 93 | cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2] 94 | cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24] 95 | cfg.MODEL.SWIN.WINDOW_SIZE = 7 96 | cfg.MODEL.SWIN.MLP_RATIO = 4.0 97 | cfg.MODEL.SWIN.QKV_BIAS = True 98 | cfg.MODEL.SWIN.QK_SCALE = None 99 | cfg.MODEL.SWIN.DROP_RATE = 0.0 100 | cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0 101 | cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3 102 | cfg.MODEL.SWIN.APE = False 103 | cfg.MODEL.SWIN.PATCH_NORM = True 104 | cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"] 105 | cfg.MODEL.SWIN.USE_CHECKPOINT = False 106 | 107 | # NOTE: maskformer2 extra configs 108 | # transformer module 109 | cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME = "MultiScaleMaskedTransformerDecoder" 110 | 111 | cfg.MODEL.TIMM = CN() 112 | cfg.MODEL.TIMM.BASE_NAME = 'resnet50' 113 | cfg.MODEL.TIMM.OUT_LEVELS = (2, 3, 4, 5) 114 | cfg.MODEL.TIMM.NORM = 'FrozenBN' 115 | cfg.MODEL.TIMM.FREEZE_AT = 0 116 | cfg.MODEL.TIMM.PRETRAINED = False 117 | 118 | # LSJ aug 119 | cfg.INPUT.IMAGE_SIZE = 1024 120 | cfg.INPUT.MIN_SCALE = 0.1 121 | cfg.INPUT.MAX_SCALE = 2.0 122 | 123 | # MSDeformAttn encoder configs 124 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"] 125 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4 126 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8 127 | 128 | # point loss configs 129 | # Number of points sampled during training for a mask point head. 130 | cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS = 112 * 112 131 | # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the 132 | # original paper. 133 | cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO = 3.0 134 | # Importance sampling parameter for PointRend point sampling during training. Parametr `beta` in 135 | # the original paper. 
136 | cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75 137 | -------------------------------------------------------------------------------- /tools/save_clip_features.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from clip import clip 3 | from PIL import Image 4 | from tqdm import tqdm 5 | import json 6 | import os 7 | 8 | device = "cuda" if torch.cuda.is_available() else "cpu" 9 | model, preprocess = clip.load("ViT-B/32", device=device) 10 | for _, param in model.named_parameters(): 11 | param.requires_grad = False 12 | 13 | # LVIS train 14 | json_path = 'datasets/lvis/lvis_v1_train.json' 15 | file_dir = "datasets/coco/train2017/" 16 | save_path = "datasets/metadata/lvis_train_clip_feature.pkl" 17 | data = json.load(open(json_path, 'r')) 18 | dic = {} 19 | for image in tqdm(data['images']): 20 | file_name = file_dir + f"{image['id']}".zfill(12) + ".jpg" 21 | image_clip = preprocess(Image.open(file_name)).unsqueeze(0).to(device) 22 | feature_clip = model.encode_image(image_clip) 23 | dic[image['id']] = feature_clip 24 | torch.save(dic, save_path) 25 | 26 | 27 | # LVIS val 28 | json_path = 'datasets/lvis/lvis_v1_val.json' 29 | file_dir = "datasets/coco/val2017/" 30 | save_path = "datasets/metadata/lvis_val_clip_feature.pkl" 31 | data = json.load(open(json_path, 'r')) 32 | dic = {} 33 | for image in tqdm(data['images']): 34 | file_name = file_dir + f"{image['id']}".zfill(12) + ".jpg" 35 | if not os.path.exists(file_name): 36 | file_name = "datasets/coco/train2017/" + f"{image['id']}".zfill(12) + ".jpg" 37 | image_clip = preprocess(Image.open(file_name)).unsqueeze(0).to(device) 38 | feature_clip = model.encode_image(image_clip) 39 | dic[image['id']] = feature_clip 40 | torch.save(dic, save_path) 41 | 42 | # LVVIS train 43 | json_path = 'datasets/LVVIS/train/train_instances_.json' # 44 | file_dir = "datasets/LVVIS/train/JPEGImages/" 45 | save_path = "datasets/metadata/lvvis_train_clip_feature.pkl" 46 | data = json.load(open(json_path, 'r')) 47 | dic = {} 48 | for video in tqdm(data['videos']): 49 | for image in video["file_names"]: 50 | file_name = file_dir + image 51 | image_clip = preprocess(Image.open(file_name)).unsqueeze(0).to(device) 52 | feature_clip = model.encode_image(image_clip) 53 | dic[file_name] = feature_clip 54 | torch.save(dic, save_path) 55 | 56 | # LVVIS val 57 | json_path = 'datasets/LVVIS/val/val_instances_.json' 58 | file_dir = "datasets/LVVIS/val/JPEGImages/" 59 | save_path = "datasets/metadata/lvvis_val_clip_feature.pkl" 60 | data = json.load(open(json_path, 'r')) 61 | dic = {} 62 | for video in tqdm(data['videos']): 63 | for image in video["file_names"]: 64 | file_name = file_dir + image 65 | image_clip = preprocess(Image.open(file_name)).unsqueeze(0).to(device) 66 | feature_clip = model.encode_image(image_clip) 67 | dic[file_name] = feature_clip 68 | torch.save(dic, save_path) 69 | 70 | # LVVIS test 71 | json_path = 'datasets/LVVIS/test/test_instances.json' 72 | file_dir = "datasets/LVVIS/test/JPEGImages/" 73 | save_path = "datasets/metadata/lvvis_test_clip_feature.pkl" 74 | data = json.load(open(json_path, 'r')) 75 | dic = {} 76 | for video in tqdm(data['videos']): 77 | for image in video["file_names"]: 78 | file_name = file_dir + image 79 | image_clip = preprocess(Image.open(file_name)).unsqueeze(0).to(device) 80 | feature_clip = model.encode_image(image_clip) 81 | dic[file_name] = feature_clip 82 | torch.save(dic, save_path) 83 | 84 | # ytvis_2019 val 85 | json_path = 'datasets/ytvis_2019/valid.json' 
86 | file_dir = "datasets/ytvis_2019/valid/JPEGImages/" 87 | save_path = "datasets/metadata/ytvis_2019_val_clip_feature.pkl" 88 | data = json.load(open(json_path, 'r')) 89 | dic = {} 90 | for video in tqdm(data['videos']): 91 | for image in video["file_names"]: 92 | file_name = file_dir + image 93 | image_clip = preprocess(Image.open(file_name)).unsqueeze(0).to(device) 94 | feature_clip = model.encode_image(image_clip) 95 | dic[file_name] = feature_clip 96 | torch.save(dic, save_path) 97 | 98 | # ytvis_2021 val 99 | json_path = 'datasets/ytvis_2021/valid.json' 100 | file_dir = "datasets/ytvis_2021/valid/JPEGImages/" 101 | save_path = "datasets/metadata/ytvis_2021_val_clip_feature.pkl" 102 | data = json.load(open(json_path, 'r')) 103 | dic = {} 104 | for video in tqdm(data['videos']): 105 | for image in video["file_names"]: 106 | file_name = file_dir + image 107 | image_clip = preprocess(Image.open(file_name)).unsqueeze(0).to(device) 108 | feature_clip = model.encode_image(image_clip) 109 | dic[file_name] = feature_clip 110 | torch.save(dic, save_path) 111 | 112 | # ovis val 113 | json_path = 'datasets/ovis/annotations/valid.json' 114 | file_dir = "datasets/ovis/valid/" 115 | save_path = "datasets/metadata/ovis_val_clip_feature.pkl" 116 | data = json.load(open(json_path, 'r')) 117 | dic = {} 118 | for video in tqdm(data['videos']): 119 | for image in video["file_names"]: 120 | file_name = file_dir + image 121 | image_clip = preprocess(Image.open(file_name)).unsqueeze(0).to(device) 122 | feature_clip = model.encode_image(image_clip) 123 | dic[file_name] = feature_clip 124 | torch.save(dic, save_path) 125 | 126 | # burst val 127 | json_path = 'datasets/burst/b2y_val.json' 128 | file_dir = "datasets/burst/val/" 129 | save_path = "datasets/metadata/burst_val_clip_feature.pkl" 130 | data = json.load(open(json_path, 'r')) 131 | dic = {} 132 | for video in tqdm(data['videos']): 133 | for image in video["file_names"]: 134 | file_name = file_dir + image 135 | image_clip = preprocess(Image.open(file_name)).unsqueeze(0).to(device) 136 | feature_clip = model.encode_image(image_clip) 137 | dic[file_name] = feature_clip 138 | torch.save(dic, save_path) 139 | -------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | # Prepare Datasets for OVFormer 2 | 3 | OVFormer has builtin support for a few datasets. 4 | The datasets are assumed to exist in a directory specified by the environment variable 5 | `DETECTRON2_DATASETS`. 6 | Under this directory, detectron2 will look for datasets in the structure described below, if needed. 7 | ``` 8 | $DETECTRON2_DATASETS/ 9 | coco/ 10 | lvis/ 11 | LVVIS/ 12 | ytvis_2019/ 13 | ytvis_2021/ 14 | ovis/ 15 | burst/ 16 | metadata/ 17 | ``` 18 | 19 | You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`. 20 | If left unset, the default is `./datasets` relative to your current working directory. 
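For example, to keep the datasets outside the repository and create the top-level layout shown above (the path below is illustrative):

```bash
export DETECTRON2_DATASETS=/data/ovformer_datasets
mkdir -p $DETECTRON2_DATASETS/{coco,lvis,LVVIS,ytvis_2019,ytvis_2021,ovis,burst,metadata}
```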
21 | 
22 | 
24 | 
25 | ## STEP-1: Prepare Image & Video Instance Segmentation datasets
26 | ### Expected dataset structure for [COCO](https://cocodataset.org/#download):
27 | 
28 | ```
29 | coco/
30 |   annotations/
31 |     instances_{train,val}2017.json
32 |   {train,val}2017/
33 | ```
34 | 
35 | ### Expected dataset structure for [LVIS](https://www.lvisdataset.org/dataset):
36 | 
37 | ```
38 | lvis/
39 |   lvis_v1_train.json
40 |   lvis_v1_train_norare.json
41 |   lvis_v1_val.json
42 | ```
43 | Next, prepare the open-vocabulary LVIS training set using
44 | ```bash
45 | python tools/remove_lvis_rare.py --ann datasets/lvis/lvis_v1_train.json
46 | ```
47 | This will generate `datasets/lvis/lvis_v1_train_norare.json`.
48 | 
49 | ### Expected dataset structure for [LV-VIS](https://github.com/haochenheheda/LVVIS):
50 | 
51 | ```
52 | LVVIS/
53 |   train/
54 |     JPEGImages/
55 |     train_instances_.json
56 |     train_instances_nonovel.json
57 |   val/
58 |     JPEGImages/
59 |     val_instances_.json
60 |   test/
61 |     JPEGImages/
62 |     test_instances.json
63 | ```
64 | LV-VIS does not officially provide a JSON file for the test set; generate it using
65 | ```bash
66 | python tools/lvivs_test_instances_json.py
67 | ```
68 | This will generate `datasets/LVVIS/test/test_instances.json`.
69 | 
70 | Next, prepare the open-vocabulary LV-VIS training set using
71 | ```bash
72 | python tools/remove_lvvis_novel.py --ann datasets/LVVIS/train/train_instances_.json
73 | ```
74 | This will generate `datasets/LVVIS/train/train_instances_nonovel.json`.
75 | 
76 | ### Expected dataset structure for [YouTubeVIS 2019](https://codalab.lisn.upsaclay.fr/competitions/7682):
77 | 
78 | ```
79 | ytvis_2019/
80 |   {train,valid,test}.json
81 |   {train,valid,test}/
82 |     JPEGImages/
83 | ```
84 | 
85 | ### Expected dataset structure for [YouTubeVIS 2021](https://codalab.lisn.upsaclay.fr/competitions/7680):
86 | 
87 | ```
88 | ytvis_2021/
89 |   {train,valid,test}.json
90 |   {train,valid,test}/
91 |     JPEGImages/
92 | ```
93 | 
94 | ### Expected dataset structure for [OVIS](https://codalab.lisn.upsaclay.fr/competitions/4763):
95 | 
96 | ```
97 | ovis/
98 |   annotations/
99 |     {train,valid,test}.json
100 |   {train,valid,test}/
101 |     JPEGImages/
102 | ```
103 | 
104 | ### Expected dataset structure for [BURST](https://github.com/Ali2500/BURST-benchmark):
105 | ```
106 | burst/
107 |   info/
108 |     class_split.json
109 |   val/
110 |     ArgoVerse/
111 |     AVA/
112 |     BDD/
113 |     Charades/
114 |     HACS/
115 |     LaSOT/
116 |     YFCC100M/
117 |     all_classes.json
118 |   b2y_val.json
119 | 
120 | ```
121 | Download the BURST val set data (except the AVA and HACS videos):
122 | 
123 | ```bash
124 | wget https://motchallenge.net/data/2-TAO_VAL.zip
125 | wget https://omnomnom.vision.rwth-aachen.de/data/BURST/annotations.zip
126 | ```
127 | To download the TAO AVA and HACS videos, you need to sign in to a [MOTChallenge](https://motchallenge.net/login/) account.
128 | 
129 | The `b2y_val.json` file is the YouTube-VIS-format annotation file generated by
130 | ```bash
131 | python tools/burst2ytvis.py --ann datasets/burst/val/all_classes.json --out datasets/burst/b2y_val.json
132 | ```
133 | 
134 | 
135 | ## STEP-2: Prepare metadata
136 | #### Download [metadata](https://drive.google.com/file/d/10M7PQdCc9n6dM0NHbOKXOO_cdBHPavRZ/view?usp=sharing), and organize the files according to the following structure:
137 | ```
138 | metadata/
139 |   fg_bg_5_10_coco_ens.npy
140 |   fg_bg_5_10_lvis_ens.npy
141 |   fg_bg_5_10_lvvis_ens.npy
142 |   fg_bg_5_10_ovis_ens.npy
143 |   fg_bg_5_10_ytvis19_ens.npy
144 |   fg_bg_5_10_ytvis21_ens.npy
145 |   fg_bg_5_10_burst_ens.npy
146 | ```
147 | This metadata contains the pre-computed classifiers for each dataset, which are generated by [DetPro](https://github.com/dyabel/detpro).
148 | If you want to generate custom classifiers, please follow that project.
149 | 
150 | ```
151 | metadata/
152 |   lvis_v1_train_cat_info.json
153 |   lvvis_train_cat_info.json
154 | ```
155 | This metadata contains the category information for the two training sets,
156 | which are generated by [get_lvis_cat_info.py](../tools/get_lvis_cat_info.py) and [get_lvvis_cat_info.py](../tools/get_lvvis_cat_info.py).
157 | 
158 | ```
159 | metadata/
160 |   lvis_train_clip_feature.pkl
161 |   lvis_val_clip_feature.pkl
162 |   lvvis_train_clip_feature.pkl
163 |   lvvis_val_clip_feature.pkl
164 |   lvvis_test_clip_feature.pkl
165 |   ytvis_2019_val_clip_feature.pkl
166 |   ytvis_2021_val_clip_feature.pkl
167 |   ovis_val_clip_feature.pkl
168 |   burst_val_clip_feature.pkl
169 | ```
170 | This metadata contains the CLIP image features for each dataset,
171 | which are generated by [save_clip_features.py](../tools/save_clip_features.py).
172 | 
173 | ## STEP-3: Prepare Pretrained Model
174 | Like [OV2Seg](https://github.com/haochenheheda/LVVIS), our paper uses ImageNet-21K pretrained models that are not part of Detectron2 (ResNet-50-21K from [MIIL](https://github.com/Alibaba-MIIL/ImageNet21K) and SwinB-21K from [Swin-Transformer](https://github.com/microsoft/Swin-Transformer)). Before training,
175 | please download the models, place them under `models/`, and follow [this tool](../tools/convert-thirdparty-pretrained-model-to-d2.py) to convert the format.
176 | 
177 | ```
178 | models/
179 |   resnet50_miil_21k.pkl
180 |   swin_base_patch4_window12_384_22k.pkl
181 | datasets/
182 |   metadata/
183 |     ...
184 | ```
--------------------------------------------------------------------------------
/ovformer/modeling/meta_arch/mask_former_head.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
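# NOTE: MaskFormerHead wires a pixel decoder (mask features + multi-scale
# features) to a transformer predictor. In OVFormer, forward() additionally
# receives per-image CLIP features (`features_clip`), which are forwarded to
# the predictor for open-vocabulary mask classification.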
2 | import logging 3 | from copy import deepcopy 4 | from typing import Callable, Dict, List, Optional, Tuple, Union 5 | 6 | import fvcore.nn.weight_init as weight_init 7 | from torch import nn 8 | from torch.nn import functional as F 9 | 10 | from detectron2.config import configurable 11 | from detectron2.layers import Conv2d, ShapeSpec, get_norm 12 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY 13 | 14 | from ..transformer_decoder.maskformer_transformer_decoder import build_transformer_decoder 15 | from ..pixel_decoder.fpn import build_pixel_decoder 16 | 17 | 18 | @SEM_SEG_HEADS_REGISTRY.register() 19 | class MaskFormerHead(nn.Module): 20 | 21 | _version = 2 22 | 23 | def _load_from_state_dict( 24 | self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs 25 | ): 26 | version = local_metadata.get("version", None) 27 | if version is None or version < 2: 28 | # Do not warn if train from scratch 29 | scratch = True 30 | logger = logging.getLogger(__name__) 31 | for k in list(state_dict.keys()): 32 | newk = k 33 | #if "sem_seg_head" in k and not k.startswith(prefix + "predictor"): 34 | # newk = k.replace(prefix, prefix + "pixel_decoder.") 35 | # # logger.debug(f"{k} ==> {newk}") 36 | if newk != k: 37 | state_dict[newk] = state_dict[k] 38 | del state_dict[k] 39 | scratch = False 40 | 41 | if not scratch: 42 | logger.warning( 43 | f"Weight format of {self.__class__.__name__} have changed! " 44 | "Please upgrade your models. Applying automatic conversion now ..." 45 | ) 46 | 47 | @configurable 48 | def __init__( 49 | self, 50 | input_shape: Dict[str, ShapeSpec], 51 | *, 52 | num_classes: int, 53 | pixel_decoder: nn.Module, 54 | loss_weight: float = 1.0, 55 | ignore_value: int = -1, 56 | # extra parameters 57 | transformer_predictor: nn.Module, 58 | transformer_in_feature: str, 59 | ): 60 | """ 61 | NOTE: this interface is experimental. 62 | Args: 63 | input_shape: shapes (channels and stride) of the input features 64 | num_classes: number of classes to predict 65 | pixel_decoder: the pixel decoder module 66 | loss_weight: loss weight 67 | ignore_value: category id to be ignored during training. 
68 | transformer_predictor: the transformer decoder that makes prediction 69 | transformer_in_feature: input feature name to the transformer_predictor 70 | """ 71 | super().__init__() 72 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) 73 | self.in_features = [k for k, v in input_shape] 74 | feature_strides = [v.stride for k, v in input_shape] 75 | feature_channels = [v.channels for k, v in input_shape] 76 | 77 | self.ignore_value = ignore_value 78 | self.common_stride = 4 79 | self.loss_weight = loss_weight 80 | 81 | self.pixel_decoder = pixel_decoder 82 | self.predictor = transformer_predictor 83 | self.transformer_in_feature = transformer_in_feature 84 | 85 | self.num_classes = num_classes 86 | 87 | @classmethod 88 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): 89 | # figure out in_channels to transformer predictor 90 | if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder": 91 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 92 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding": 93 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 94 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder": # for maskformer2 95 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 96 | else: 97 | transformer_predictor_in_channels = input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels 98 | 99 | return { 100 | "input_shape": { 101 | k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES 102 | }, 103 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 104 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 105 | "pixel_decoder": build_pixel_decoder(cfg, input_shape), 106 | "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, 107 | "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE, 108 | "transformer_predictor": build_transformer_decoder( 109 | cfg, 110 | transformer_predictor_in_channels, 111 | mask_classification=True, 112 | ), 113 | } 114 | 115 | def forward(self, features, features_clip, mask=None): 116 | return self.layers(features, features_clip, mask) 117 | 118 | def layers(self, features, features_clip, mask=None): 119 | mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(features) 120 | if self.transformer_in_feature == "multi_scale_pixel_decoder": 121 | predictions = self.predictor(multi_scale_features, mask_features, features_clip, mask) 122 | else: 123 | if self.transformer_in_feature == "transformer_encoder": 124 | assert ( 125 | transformer_encoder_features is not None 126 | ), "Please use the TransformerEncoderPixelDecoder." 127 | predictions = self.predictor(transformer_encoder_features, mask_features, mask) 128 | elif self.transformer_in_feature == "pixel_embedding": 129 | predictions = self.predictor(mask_features, mask_features, mask) 130 | else: 131 | predictions = self.predictor(features[self.transformer_in_feature], mask_features, mask) 132 | return predictions 133 | -------------------------------------------------------------------------------- /ovformer/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py 3 | import copy 4 | import logging 5 | 6 | import numpy as np 7 | import torch 8 | 9 | from detectron2.config import configurable 10 | from detectron2.data import detection_utils as utils 11 | from detectron2.data import transforms as T 12 | from detectron2.data.transforms import TransformGen 13 | from detectron2.structures import BitMasks, Boxes, Instances 14 | 15 | __all__ = ["COCOPanopticNewBaselineDatasetMapper"] 16 | 17 | 18 | def build_transform_gen(cfg, is_train): 19 | """ 20 | Create a list of default :class:`Augmentation` from config. 21 | Now it includes resizing and flipping. 22 | Returns: 23 | list[Augmentation] 24 | """ 25 | assert is_train, "Only support training augmentation" 26 | image_size = cfg.INPUT.IMAGE_SIZE 27 | min_scale = cfg.INPUT.MIN_SCALE 28 | max_scale = cfg.INPUT.MAX_SCALE 29 | 30 | augmentation = [] 31 | 32 | if cfg.INPUT.RANDOM_FLIP != "none": 33 | augmentation.append( 34 | T.RandomFlip( 35 | horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal", 36 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical", 37 | ) 38 | ) 39 | 40 | augmentation.extend([ 41 | T.ResizeScale( 42 | min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size 43 | ), 44 | T.FixedSizeCrop(crop_size=(image_size, image_size)), 45 | ]) 46 | 47 | return augmentation 48 | 49 | 50 | # This is specifically designed for the COCO dataset. 51 | class COCOPanopticNewBaselineDatasetMapper: 52 | """ 53 | A callable which takes a dataset dict in Detectron2 Dataset format, 54 | and map it into a format used by MaskFormer. 55 | 56 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. 57 | 58 | The callable currently does the following: 59 | 60 | 1. Read the image from "file_name" 61 | 2. Applies geometric transforms to the image and annotation 62 | 3. Find and applies suitable cropping to the image and annotation 63 | 4. Prepare image and annotation to Tensors 64 | """ 65 | 66 | @configurable 67 | def __init__( 68 | self, 69 | is_train=True, 70 | *, 71 | tfm_gens, 72 | image_format, 73 | ): 74 | """ 75 | NOTE: this interface is experimental. 76 | Args: 77 | is_train: for training or inference 78 | augmentations: a list of augmentations or deterministic transforms to apply 79 | crop_gen: crop augmentation 80 | tfm_gens: data augmentation 81 | image_format: an image format supported by :func:`detection_utils.read_image`. 82 | """ 83 | self.tfm_gens = tfm_gens 84 | logging.getLogger(__name__).info( 85 | "[COCOPanopticNewBaselineDatasetMapper] Full TransformGens used in training: {}".format( 86 | str(self.tfm_gens) 87 | ) 88 | ) 89 | 90 | self.img_format = image_format 91 | self.is_train = is_train 92 | 93 | @classmethod 94 | def from_config(cls, cfg, is_train=True): 95 | # Build augmentation 96 | tfm_gens = build_transform_gen(cfg, is_train) 97 | 98 | ret = { 99 | "is_train": is_train, 100 | "tfm_gens": tfm_gens, 101 | "image_format": cfg.INPUT.FORMAT, 102 | } 103 | return ret 104 | 105 | def __call__(self, dataset_dict): 106 | """ 107 | Args: 108 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 
109 | 110 | Returns: 111 | dict: a format that builtin models in detectron2 accept 112 | """ 113 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 114 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 115 | utils.check_image_size(dataset_dict, image) 116 | 117 | image, transforms = T.apply_transform_gens(self.tfm_gens, image) 118 | image_shape = image.shape[:2] # h, w 119 | 120 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 121 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 122 | # Therefore it's important to use torch.Tensor. 123 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 124 | 125 | if not self.is_train: 126 | # USER: Modify this if you want to keep them for some reason. 127 | dataset_dict.pop("annotations", None) 128 | return dataset_dict 129 | 130 | if "pan_seg_file_name" in dataset_dict: 131 | pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") 132 | segments_info = dataset_dict["segments_info"] 133 | 134 | # apply the same transformation to panoptic segmentation 135 | pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) 136 | 137 | from panopticapi.utils import rgb2id 138 | 139 | pan_seg_gt = rgb2id(pan_seg_gt) 140 | 141 | instances = Instances(image_shape) 142 | classes = [] 143 | masks = [] 144 | for segment_info in segments_info: 145 | class_id = segment_info["category_id"] 146 | if not segment_info["iscrowd"]: 147 | classes.append(class_id) 148 | masks.append(pan_seg_gt == segment_info["id"]) 149 | 150 | classes = np.array(classes) 151 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64) 152 | if len(masks) == 0: 153 | # Some image does not have annotation (all ignored) 154 | instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) 155 | instances.gt_boxes = Boxes(torch.zeros((0, 4))) 156 | else: 157 | masks = BitMasks( 158 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) 159 | ) 160 | instances.gt_masks = masks.tensor 161 | instances.gt_boxes = masks.get_bounding_boxes() 162 | 163 | dataset_dict["instances"] = instances 164 | 165 | return dataset_dict 166 | -------------------------------------------------------------------------------- /tools/analyze_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detectron2/blob/main/tools/analyze_model.py 4 | 5 | import logging 6 | import numpy as np 7 | from collections import Counter 8 | import tqdm 9 | from fvcore.nn import flop_count_table # can also try flop_count_str 10 | 11 | from detectron2.checkpoint import DetectionCheckpointer 12 | from detectron2.config import CfgNode, LazyConfig, get_cfg, instantiate 13 | from detectron2.data import build_detection_test_loader 14 | from detectron2.engine import default_argument_parser 15 | from detectron2.modeling import build_model 16 | from detectron2.projects.deeplab import add_deeplab_config 17 | from detectron2.utils.analysis import ( 18 | FlopCountAnalysis, 19 | activation_count_operators, 20 | parameter_count_table, 21 | ) 22 | from detectron2.utils.logger import setup_logger 23 | 24 | # fmt: off 25 | import os 26 | import sys 27 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 28 | # fmt: on 29 | 30 | from ovformer import add_ovformer_config 31 | 32 | logger = logging.getLogger("detectron2") 33 | 34 | 35 | def setup(args): 36 | if args.config_file.endswith(".yaml"): 37 | cfg = get_cfg() 38 | add_deeplab_config(cfg) 39 | add_ovformer_config(cfg) 40 | cfg.merge_from_file(args.config_file) 41 | cfg.DATALOADER.NUM_WORKERS = 0 42 | cfg.merge_from_list(args.opts) 43 | cfg.freeze() 44 | else: 45 | cfg = LazyConfig.load(args.config_file) 46 | cfg = LazyConfig.apply_overrides(cfg, args.opts) 47 | setup_logger(name="fvcore") 48 | setup_logger() 49 | return cfg 50 | 51 | 52 | def do_flop(cfg): 53 | if isinstance(cfg, CfgNode): 54 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) 55 | model = build_model(cfg) 56 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 57 | else: 58 | data_loader = instantiate(cfg.dataloader.test) 59 | model = instantiate(cfg.model) 60 | model.to(cfg.train.device) 61 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint) 62 | model.eval() 63 | 64 | counts = Counter() 65 | total_flops = [] 66 | for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa 67 | if args.use_fixed_input_size and isinstance(cfg, CfgNode): 68 | import torch 69 | crop_size = cfg.INPUT.CROP.SIZE[0] 70 | data[0]["image"] = torch.zeros((3, crop_size, crop_size)) 71 | flops = FlopCountAnalysis(model, data) 72 | if idx > 0: 73 | flops.unsupported_ops_warnings(False).uncalled_modules_warnings(False) 74 | counts += flops.by_operator() 75 | total_flops.append(flops.total()) 76 | 77 | logger.info("Flops table computed from only one input sample:\n" + flop_count_table(flops)) 78 | logger.info( 79 | "Average GFlops for each type of operators:\n" 80 | + str([(k, v / (idx + 1) / 1e9) for k, v in counts.items()]) 81 | ) 82 | logger.info( 83 | "Total GFlops: {:.1f}±{:.1f}".format(np.mean(total_flops) / 1e9, np.std(total_flops) / 1e9) 84 | ) 85 | 86 | 87 | def do_activation(cfg): 88 | if isinstance(cfg, CfgNode): 89 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) 90 | model = build_model(cfg) 91 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 92 | else: 93 | data_loader = instantiate(cfg.dataloader.test) 94 | model = instantiate(cfg.model) 95 | model.to(cfg.train.device) 96 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint) 97 | model.eval() 98 | 99 | counts = Counter() 100 | total_activations = [] 101 | for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa 102 | count = activation_count_operators(model, data) 103 | counts += count 104 | 
total_activations.append(sum(count.values()))
105 |     logger.info(
106 |         "(Million) Activations for Each Type of Operators:\n"
107 |         + str([(k, v / idx) for k, v in counts.items()])
108 |     )
109 |     logger.info(
110 |         "Total (Million) Activations: {}±{}".format(
111 |             np.mean(total_activations), np.std(total_activations)
112 |         )
113 |     )
114 | 
115 | 
116 | def do_parameter(cfg):
117 |     if isinstance(cfg, CfgNode):
118 |         model = build_model(cfg)
119 |     else:
120 |         model = instantiate(cfg.model)
121 |     logger.info("Parameter Count:\n" + parameter_count_table(model, max_depth=5))
122 | 
123 | 
124 | def do_structure(cfg):
125 |     if isinstance(cfg, CfgNode):
126 |         model = build_model(cfg)
127 |     else:
128 |         model = instantiate(cfg.model)
129 |     logger.info("Model Structure:\n" + str(model))
130 | 
131 | 
132 | if __name__ == "__main__":
133 |     parser = default_argument_parser(
134 |         epilog="""
135 | Examples:
136 | To show parameters of a model:
137 | $ ./analyze_model.py --tasks parameter \\
138 |     --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml
139 | Flops and activations are data-dependent, therefore inputs and model weights
140 | are needed to count them:
141 | $ ./analyze_model.py --num-inputs 100 --tasks flop \\
142 |     --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \\
143 |     MODEL.WEIGHTS /path/to/model.pkl
144 | For OVFormer:
145 | $ ./analyze_model.py --num-inputs 100 --tasks flop --config-file configs/lvis/ovformer_R50_bs8.yaml \\
146 |     MODEL.WEIGHTS models/ovformer_r50_lvis.pth
147 | """
148 |     )
149 |     parser.add_argument(
150 |         "--tasks",
151 |         choices=["flop", "activation", "parameter", "structure"],
152 |         required=True,
153 |         nargs="+",
154 |     )
155 |     parser.add_argument(
156 |         "-n",
157 |         "--num-inputs",
158 |         default=100,
159 |         type=int,
160 |         help="number of inputs used to compute statistics for flops/activations, "
161 |         "both are data dependent.",
162 |     )
163 |     parser.add_argument(
164 |         "--use-fixed-input-size",
165 |         action="store_true",
166 |         help="use fixed input size when calculating flops",
167 |     )
168 |     args = parser.parse_args()
169 |     assert not args.eval_only
170 |     assert args.num_gpus == 1
171 | 
172 |     cfg = setup(args)
173 | 
174 |     for task in args.tasks:
175 |         {
176 |             "flop": do_flop,
177 |             "activation": do_activation,
178 |             "parameter": do_parameter,
179 |             "structure": do_structure,
180 |         }[task](cfg)
181 | 
--------------------------------------------------------------------------------
/ovformer/data_video/augmentation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC
3 | 
4 | import numpy as np
5 | import logging
6 | import sys
7 | from fvcore.transforms.transform import (
8 |     HFlipTransform,
9 |     NoOpTransform,
10 |     VFlipTransform,
11 | )
12 | from PIL import Image
13 | 
14 | from detectron2.data import transforms as T
15 | 
16 | 
17 | class ResizeShortestEdge(T.Augmentation):
18 |     """
19 |     Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge.
20 |     If `max_size` is reached, then downscale so that the longer edge does not exceed max_size.
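    When `clip_frame_cnt` > 1, a new target size is sampled only once every
    `clip_frame_cnt` calls, so every frame of the same video clip is resized
    with a consistent scale (see `get_transform` below).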
21 | """ 22 | 23 | def __init__( 24 | self, short_edge_length, max_size=sys.maxsize, sample_style="range", interp=Image.BILINEAR, clip_frame_cnt=1 25 | ): 26 | """ 27 | Args: 28 | short_edge_length (list[int]): If ``sample_style=="range"``, 29 | a [min, max] interval from which to sample the shortest edge length. 30 | If ``sample_style=="choice"``, a list of shortest edge lengths to sample from. 31 | max_size (int): maximum allowed longest edge length. 32 | sample_style (str): either "range" or "choice". 33 | """ 34 | super().__init__() 35 | assert sample_style in ["range", "choice", "range_by_clip", "choice_by_clip"], sample_style 36 | 37 | self.is_range = ("range" in sample_style) 38 | if isinstance(short_edge_length, int): 39 | short_edge_length = (short_edge_length, short_edge_length) 40 | if self.is_range: 41 | assert len(short_edge_length) == 2, ( 42 | "short_edge_length must be two values using 'range' sample style." 43 | f" Got {short_edge_length}!" 44 | ) 45 | self._cnt = 0 46 | self._init(locals()) 47 | 48 | def get_transform(self, image): 49 | if self._cnt % self.clip_frame_cnt == 0: 50 | if self.is_range: 51 | self.size = np.random.randint(self.short_edge_length[0], self.short_edge_length[1] + 1) 52 | else: 53 | self.size = np.random.choice(self.short_edge_length) 54 | if self.size == 0: 55 | return NoOpTransform() 56 | 57 | self._cnt = 0 # avoiding overflow 58 | self._cnt += 1 59 | 60 | h, w = image.shape[:2] 61 | 62 | scale = self.size * 1.0 / min(h, w) 63 | if h < w: 64 | newh, neww = self.size, scale * w 65 | else: 66 | newh, neww = scale * h, self.size 67 | if max(newh, neww) > self.max_size: 68 | scale = self.max_size * 1.0 / max(newh, neww) 69 | newh = newh * scale 70 | neww = neww * scale 71 | neww = int(neww + 0.5) 72 | newh = int(newh + 0.5) 73 | return T.ResizeTransform(h, w, newh, neww, self.interp) 74 | 75 | 76 | class RandomFlip(T.Augmentation): 77 | """ 78 | Flip the image horizontally or vertically with the given probability. 79 | """ 80 | 81 | def __init__(self, prob=0.5, *, horizontal=True, vertical=False, clip_frame_cnt=1): 82 | """ 83 | Args: 84 | prob (float): probability of flip. 85 | horizontal (boolean): whether to apply horizontal flipping 86 | vertical (boolean): whether to apply vertical flipping 87 | """ 88 | super().__init__() 89 | 90 | if horizontal and vertical: 91 | raise ValueError("Cannot do both horiz and vert. 
Please use two Flip instead.") 92 | if not horizontal and not vertical: 93 | raise ValueError("At least one of horiz or vert has to be True!") 94 | self._cnt = 0 95 | 96 | self._init(locals()) 97 | 98 | def get_transform(self, image): 99 | if self._cnt % self.clip_frame_cnt == 0: 100 | self.do = self._rand_range() < self.prob 101 | self._cnt = 0 # avoiding overflow 102 | self._cnt += 1 103 | 104 | h, w = image.shape[:2] 105 | 106 | if self.do: 107 | if self.horizontal: 108 | return HFlipTransform(w) 109 | elif self.vertical: 110 | return VFlipTransform(h) 111 | else: 112 | return NoOpTransform() 113 | 114 | 115 | def build_augmentation(cfg, is_train): 116 | logger = logging.getLogger(__name__) 117 | aug_list = [] 118 | if is_train: 119 | # Crop 120 | if cfg.INPUT.CROP.ENABLED: 121 | aug_list.append(T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE)) 122 | 123 | # Resize 124 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 125 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 126 | sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING 127 | ms_clip_frame_cnt = cfg.INPUT.SAMPLING_FRAME_NUM if "by_clip" in cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING else 1 128 | aug_list.append(ResizeShortestEdge(min_size, max_size, sample_style, clip_frame_cnt=ms_clip_frame_cnt)) 129 | 130 | # Flip 131 | if cfg.INPUT.RANDOM_FLIP != "none": 132 | if cfg.INPUT.RANDOM_FLIP == "flip_by_clip": 133 | flip_clip_frame_cnt = cfg.INPUT.SAMPLING_FRAME_NUM 134 | else: 135 | flip_clip_frame_cnt = 1 136 | 137 | aug_list.append( 138 | # NOTE using RandomFlip modified for the support of flip maintenance 139 | RandomFlip( 140 | horizontal=(cfg.INPUT.RANDOM_FLIP == "horizontal") or (cfg.INPUT.RANDOM_FLIP == "flip_by_clip"), 141 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical", 142 | clip_frame_cnt=flip_clip_frame_cnt, 143 | ) 144 | ) 145 | 146 | # Additional augmentations : brightness, contrast, saturation, rotation 147 | augmentations = cfg.INPUT.AUGMENTATIONS 148 | if "brightness" in augmentations: 149 | aug_list.append(T.RandomBrightness(0.9, 1.1)) 150 | if "contrast" in augmentations: 151 | aug_list.append(T.RandomContrast(0.9, 1.1)) 152 | if "saturation" in augmentations: 153 | aug_list.append(T.RandomSaturation(0.9, 1.1)) 154 | if "rotation" in augmentations: 155 | aug_list.append( 156 | T.RandomRotation( 157 | [-15, 15], expand=False, center=[(0.4, 0.4), (0.6, 0.6)], sample_style="range" 158 | ) 159 | ) 160 | else: 161 | # Resize 162 | min_size = cfg.INPUT.MIN_SIZE_TEST 163 | max_size = cfg.INPUT.MAX_SIZE_TEST 164 | sample_style = "choice" 165 | aug_list.append(T.ResizeShortestEdge(min_size, max_size, sample_style)) 166 | 167 | return aug_list 168 | -------------------------------------------------------------------------------- /ovformer_video/modeling/transformer_decoder/zero_shot_classifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
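# NOTE: ZeroShotClassifier maps decoder queries into the CLIP embedding space
# with a two-layer MLP, refines them by cross-attending over CLIP image
# features, and scores them against L2-normalized category text embeddings
# (loaded from `zs_weight_path`, or randomly initialized when it is 'rand')
# via a temperature-scaled dot product.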
2 | import numpy as np 3 | import torch 4 | from torch import nn, Tensor 5 | from torch.nn import functional as F 6 | from detectron2.config import configurable 7 | from detectron2.layers import Linear, ShapeSpec 8 | from typing import Optional 9 | 10 | 11 | class CrossAttentionLayer(nn.Module): 12 | def __init__(self, d_model, nhead, dropout=0.0, 13 | activation="relu", normalize_before=False): 14 | super().__init__() 15 | self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 16 | 17 | self.norm = nn.LayerNorm(d_model) 18 | self.dropout = nn.Dropout(dropout) 19 | 20 | self.activation = _get_activation_fn(activation) 21 | self.normalize_before = normalize_before 22 | 23 | self._reset_parameters() 24 | 25 | def _reset_parameters(self): 26 | for p in self.parameters(): 27 | if p.dim() > 1: 28 | nn.init.xavier_uniform_(p) 29 | 30 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 31 | return tensor if pos is None else tensor + pos 32 | 33 | def forward_post(self, tgt, memory, 34 | memory_mask: Optional[Tensor] = None, 35 | memory_key_padding_mask: Optional[Tensor] = None, 36 | pos: Optional[Tensor] = None, 37 | query_pos: Optional[Tensor] = None): 38 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), 39 | key=self.with_pos_embed(memory, pos), 40 | value=memory, attn_mask=memory_mask, 41 | key_padding_mask=memory_key_padding_mask)[0] 42 | tgt = tgt + self.dropout(tgt2) 43 | tgt = self.norm(tgt) 44 | 45 | return tgt 46 | 47 | def forward_pre(self, tgt, memory, 48 | memory_mask: Optional[Tensor] = None, 49 | memory_key_padding_mask: Optional[Tensor] = None, 50 | pos: Optional[Tensor] = None, 51 | query_pos: Optional[Tensor] = None): 52 | tgt2 = self.norm(tgt) 53 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos), 54 | key=self.with_pos_embed(memory, pos), 55 | value=memory, attn_mask=memory_mask, 56 | key_padding_mask=memory_key_padding_mask)[0] 57 | tgt = tgt + self.dropout(tgt2) 58 | 59 | return tgt 60 | 61 | def forward(self, tgt, memory, 62 | memory_mask: Optional[Tensor] = None, 63 | memory_key_padding_mask: Optional[Tensor] = None, 64 | pos: Optional[Tensor] = None, 65 | query_pos: Optional[Tensor] = None): 66 | if self.normalize_before: 67 | return self.forward_pre(tgt, memory, memory_mask, 68 | memory_key_padding_mask, pos, query_pos) 69 | return self.forward_post(tgt, memory, memory_mask, 70 | memory_key_padding_mask, pos, query_pos) 71 | 72 | 73 | def _get_activation_fn(activation): 74 | """Return an activation function given a string""" 75 | if activation == "relu": 76 | return F.relu 77 | if activation == "gelu": 78 | return F.gelu 79 | if activation == "glu": 80 | return F.glu 81 | raise RuntimeError(F"activation should be relu/gelu, not {activation}.") 82 | 83 | 84 | class ZeroShotClassifier(nn.Module): 85 | def __init__( 86 | self, 87 | input_shape: ShapeSpec, 88 | num_classes: int, 89 | zs_weight_path: str, 90 | zs_weight_dim: int = 512, 91 | use_bias: float = 0.0, 92 | norm_weight: bool = True, 93 | norm_temperature: float = 50.0, 94 | ): 95 | super().__init__() 96 | if isinstance(input_shape, int): # some backward compatibility 97 | input_shape = ShapeSpec(channels=input_shape) 98 | input_size = input_shape.channels * (input_shape.width or 1) * (input_shape.height or 1) 99 | self.norm_weight = norm_weight 100 | self.norm_temperature = norm_temperature 101 | 102 | self.use_bias = use_bias < 0 103 | if self.use_bias: 104 | self.cls_bias = nn.Parameter(torch.ones(1) * use_bias) 105 | 106 | self.linear = 
nn.Sequential(nn.Linear(input_size, zs_weight_dim//2), 107 | nn.ReLU(), 108 | nn.Linear(zs_weight_dim//2, zs_weight_dim)) 109 | 110 | self.cross_attention = CrossAttentionLayer( 111 | d_model=zs_weight_dim, 112 | nhead=8, 113 | dropout=0.0, 114 | normalize_before=False, 115 | ) 116 | 117 | 118 | if zs_weight_path == 'rand': 119 | zs_weight = torch.randn((zs_weight_dim, num_classes)) 120 | nn.init.normal_(zs_weight, std=0.01) 121 | else: 122 | zs_weight = torch.tensor( 123 | np.load(zs_weight_path), 124 | dtype=torch.float32).permute(1, 0).contiguous() # D x C 125 | zs_weight = torch.cat( 126 | [zs_weight, zs_weight.new_zeros((zs_weight_dim, 1))], 127 | dim=1) # D x (C + 1) 128 | 129 | if self.norm_weight: 130 | zs_weight = F.normalize(zs_weight, p=2, dim=0) 131 | 132 | if zs_weight_path == 'rand': 133 | self.zs_weight = nn.Parameter(zs_weight) 134 | else: 135 | self.register_buffer('zs_weight', zs_weight) 136 | assert self.zs_weight.shape[1] == num_classes + 1 137 | 138 | 139 | def forward(self, x, features_clip, classifier=None): 140 | x = self.linear(x).transpose(0, 1) # (b,100,512) 141 | x = self.cross_attention( 142 | x, features_clip, 143 | memory_mask=None, 144 | memory_key_padding_mask=None, 145 | pos=None, query_pos=None 146 | ) 147 | x = x.transpose(0, 1) 148 | 149 | if classifier is not None: 150 | zs_weight = classifier.permute(1, 0).contiguous() # D x C' 151 | zs_weight = F.normalize(zs_weight, p=2, dim=0) \ 152 | if self.norm_weight else zs_weight 153 | else: 154 | zs_weight = self.zs_weight # (512, 1197) 155 | if self.norm_weight: 156 | x = self.norm_temperature * F.normalize(x, p=2, dim=2) 157 | bs, qn, _ = x.shape 158 | x = x.reshape(bs * qn, -1) # (b*100, 512) 159 | x = torch.mm(x, zs_weight) # (b*100, 1197) 160 | x = x.reshape(bs, qn, -1) # (b,100,1197) 161 | if self.use_bias: 162 | x = x + self.cls_bias 163 | return x 164 | -------------------------------------------------------------------------------- /ovformer/modeling/transformer_decoder/zero_shot_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch import nn, Tensor 4 | from torch.nn import functional as F 5 | from detectron2.config import configurable 6 | from detectron2.layers import Linear, ShapeSpec 7 | from typing import Optional 8 | 9 | 10 | class CrossAttentionLayer(nn.Module): 11 | def __init__(self, d_model, nhead, dropout=0.0, 12 | activation="relu", normalize_before=False): 13 | super().__init__() 14 | self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 15 | 16 | self.norm = nn.LayerNorm(d_model) 17 | self.dropout = nn.Dropout(dropout) 18 | 19 | self.activation = _get_activation_fn(activation) 20 | self.normalize_before = normalize_before 21 | 22 | self._reset_parameters() 23 | 24 | def _reset_parameters(self): 25 | for p in self.parameters(): 26 | if p.dim() > 1: 27 | nn.init.xavier_uniform_(p) 28 | 29 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 30 | return tensor if pos is None else tensor + pos 31 | 32 | def forward_post(self, tgt, memory, 33 | memory_mask: Optional[Tensor] = None, 34 | memory_key_padding_mask: Optional[Tensor] = None, 35 | pos: Optional[Tensor] = None, 36 | query_pos: Optional[Tensor] = None): 37 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), 38 | key=self.with_pos_embed(memory, pos), 39 | value=memory, attn_mask=memory_mask, 40 | key_padding_mask=memory_key_padding_mask)[0] 41 | tgt = tgt + self.dropout(tgt2) 42 | tgt = 
self.norm(tgt) 43 | 44 | return tgt 45 | 46 | def forward_pre(self, tgt, memory, 47 | memory_mask: Optional[Tensor] = None, 48 | memory_key_padding_mask: Optional[Tensor] = None, 49 | pos: Optional[Tensor] = None, 50 | query_pos: Optional[Tensor] = None): 51 | tgt2 = self.norm(tgt) 52 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos), 53 | key=self.with_pos_embed(memory, pos), 54 | value=memory, attn_mask=memory_mask, 55 | key_padding_mask=memory_key_padding_mask)[0] 56 | tgt = tgt + self.dropout(tgt2) 57 | 58 | return tgt 59 | 60 | def forward(self, tgt, memory, 61 | memory_mask: Optional[Tensor] = None, 62 | memory_key_padding_mask: Optional[Tensor] = None, 63 | pos: Optional[Tensor] = None, 64 | query_pos: Optional[Tensor] = None): 65 | if self.normalize_before: 66 | return self.forward_pre(tgt, memory, memory_mask, 67 | memory_key_padding_mask, pos, query_pos) 68 | return self.forward_post(tgt, memory, memory_mask, 69 | memory_key_padding_mask, pos, query_pos) 70 | 71 | 72 | def _get_activation_fn(activation): 73 | """Return an activation function given a string""" 74 | if activation == "relu": 75 | return F.relu 76 | if activation == "gelu": 77 | return F.gelu 78 | if activation == "glu": 79 | return F.glu 80 | raise RuntimeError(F"activation should be relu/gelu, not {activation}.") 81 | 82 | 83 | class ZeroShotClassifier(nn.Module): 84 | def __init__( 85 | self, 86 | input_shape: ShapeSpec, 87 | num_classes: int, 88 | zs_weight_path: str, 89 | zs_weight_dim: int = 512, 90 | use_bias: float = 0.0, 91 | norm_weight: bool = True, 92 | norm_temperature: float = 50.0, 93 | ): 94 | super().__init__() 95 | if isinstance(input_shape, int): # some backward compatibility 96 | input_shape = ShapeSpec(channels=input_shape) 97 | input_size = input_shape.channels * (input_shape.width or 1) * (input_shape.height or 1) 98 | self.norm_weight = norm_weight 99 | self.norm_temperature = norm_temperature 100 | 101 | self.use_bias = use_bias < 0 102 | if self.use_bias: 103 | self.cls_bias = nn.Parameter(torch.ones(1) * use_bias) 104 | 105 | self.linear = nn.Sequential(nn.Linear(input_size, zs_weight_dim // 2), 106 | nn.ReLU(), 107 | nn.Linear(zs_weight_dim // 2, zs_weight_dim)) 108 | 109 | self.cross_attention = CrossAttentionLayer( 110 | d_model=zs_weight_dim, 111 | nhead=8, 112 | dropout=0.0, 113 | normalize_before=False, 114 | ) 115 | 116 | if zs_weight_path == 'rand': 117 | zs_weight = torch.randn((zs_weight_dim, num_classes)) 118 | nn.init.normal_(zs_weight, std=0.01) 119 | else: 120 | zs_weight = torch.tensor( 121 | np.load(zs_weight_path), 122 | dtype=torch.float32).permute(1, 0).contiguous() # D x C 123 | zs_weight = torch.cat( 124 | [zs_weight, zs_weight.new_zeros((zs_weight_dim, 1))], 125 | dim=1) # D x (C + 1) 126 | 127 | if self.norm_weight: 128 | zs_weight = F.normalize(zs_weight, p=2, dim=0) 129 | 130 | if zs_weight_path == 'rand': 131 | self.zs_weight = nn.Parameter(zs_weight) 132 | else: 133 | self.register_buffer('zs_weight', zs_weight) 134 | assert self.zs_weight.shape[1] == num_classes + 1 135 | 136 | def forward(self, x, features_clip, classifier=None): 137 | x = self.linear(x).transpose(0, 1) # (b,100,512) 138 | features_clip = torch.stack(features_clip, dim=1) 139 | x = self.cross_attention( 140 | x, features_clip, 141 | memory_mask=None, 142 | memory_key_padding_mask=None, 143 | pos=None, query_pos=None 144 | ) 145 | x = x.transpose(0, 1) 146 | 147 | if classifier is not None: 148 | zs_weight = classifier.permute(1, 0).contiguous() # D x C' 149 | zs_weight = 
F.normalize(zs_weight, p=2, dim=0) \ 150 | if self.norm_weight else zs_weight 151 | else: 152 | zs_weight = self.zs_weight # (512, k) 153 | if self.norm_weight: 154 | x = self.norm_temperature * F.normalize(x, p=2, dim=2) 155 | bs, qn, _ = x.shape 156 | x = x.reshape(bs * qn, -1) # (b*100, 512) 157 | x = torch.mm(x, zs_weight) # (b*100, k) 158 | x = x.reshape(bs, qn, -1) # (b,100,k) 159 | if self.use_bias: 160 | x = x + self.cls_bias 161 | return x 162 | -------------------------------------------------------------------------------- /ovformer/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | import logging 4 | 5 | import numpy as np 6 | import torch 7 | from torch.nn import functional as F 8 | 9 | from detectron2.config import configurable 10 | from detectron2.data import detection_utils as utils 11 | from detectron2.data import transforms as T 12 | from detectron2.structures import BitMasks, Instances 13 | 14 | from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper 15 | 16 | __all__ = ["MaskFormerPanopticDatasetMapper"] 17 | 18 | 19 | class MaskFormerPanopticDatasetMapper(MaskFormerSemanticDatasetMapper): 20 | """ 21 | A callable which takes a dataset dict in Detectron2 Dataset format, 22 | and map it into a format used by MaskFormer for panoptic segmentation. 23 | 24 | The callable currently does the following: 25 | 26 | 1. Read the image from "file_name" 27 | 2. Applies geometric transforms to the image and annotation 28 | 3. Find and applies suitable cropping to the image and annotation 29 | 4. Prepare image and annotation to Tensors 30 | """ 31 | 32 | @configurable 33 | def __init__( 34 | self, 35 | is_train=True, 36 | *, 37 | augmentations, 38 | image_format, 39 | ignore_label, 40 | size_divisibility, 41 | ): 42 | """ 43 | NOTE: this interface is experimental. 44 | Args: 45 | is_train: for training or inference 46 | augmentations: a list of augmentations or deterministic transforms to apply 47 | image_format: an image format supported by :func:`detection_utils.read_image`. 48 | ignore_label: the label that is ignored to evaluation 49 | size_divisibility: pad image size to be divisible by this value 50 | """ 51 | super().__init__( 52 | is_train, 53 | augmentations=augmentations, 54 | image_format=image_format, 55 | ignore_label=ignore_label, 56 | size_divisibility=size_divisibility, 57 | ) 58 | 59 | def __call__(self, dataset_dict): 60 | """ 61 | Args: 62 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 63 | 64 | Returns: 65 | dict: a format that builtin models in detectron2 accept 66 | """ 67 | assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!" 
68 | 69 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 70 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 71 | utils.check_image_size(dataset_dict, image) 72 | 73 | # semantic segmentation 74 | if "sem_seg_file_name" in dataset_dict: 75 | # PyTorch transformation not implemented for uint16, so converting it to double first 76 | sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") 77 | else: 78 | sem_seg_gt = None 79 | 80 | # panoptic segmentation 81 | if "pan_seg_file_name" in dataset_dict: 82 | pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") 83 | segments_info = dataset_dict["segments_info"] 84 | else: 85 | pan_seg_gt = None 86 | segments_info = None 87 | 88 | if pan_seg_gt is None: 89 | raise ValueError( 90 | "Cannot find 'pan_seg_file_name' for panoptic segmentation dataset {}.".format( 91 | dataset_dict["file_name"] 92 | ) 93 | ) 94 | 95 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt) 96 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) 97 | image = aug_input.image 98 | if sem_seg_gt is not None: 99 | sem_seg_gt = aug_input.sem_seg 100 | 101 | # apply the same transformation to panoptic segmentation 102 | pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) 103 | 104 | from panopticapi.utils import rgb2id 105 | 106 | pan_seg_gt = rgb2id(pan_seg_gt) 107 | 108 | # Pad image and segmentation label here! 109 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 110 | if sem_seg_gt is not None: 111 | sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) 112 | pan_seg_gt = torch.as_tensor(pan_seg_gt.astype("long")) 113 | 114 | if self.size_divisibility > 0: 115 | image_size = (image.shape[-2], image.shape[-1]) 116 | padding_size = [ 117 | 0, 118 | self.size_divisibility - image_size[1], 119 | 0, 120 | self.size_divisibility - image_size[0], 121 | ] 122 | image = F.pad(image, padding_size, value=128).contiguous() 123 | if sem_seg_gt is not None: 124 | sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous() 125 | pan_seg_gt = F.pad( 126 | pan_seg_gt, padding_size, value=0 127 | ).contiguous() # 0 is the VOID panoptic label 128 | 129 | image_shape = (image.shape[-2], image.shape[-1]) # h, w 130 | 131 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 132 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 133 | # Therefore it's important to use torch.Tensor. 
134 | dataset_dict["image"] = image 135 | if sem_seg_gt is not None: 136 | dataset_dict["sem_seg"] = sem_seg_gt.long() 137 | 138 | if "annotations" in dataset_dict: 139 | raise ValueError("Pemantic segmentation dataset should not have 'annotations'.") 140 | 141 | # Prepare per-category binary masks 142 | pan_seg_gt = pan_seg_gt.numpy() 143 | instances = Instances(image_shape) 144 | classes = [] 145 | masks = [] 146 | for segment_info in segments_info: 147 | class_id = segment_info["category_id"] 148 | if not segment_info["iscrowd"]: 149 | classes.append(class_id) 150 | masks.append(pan_seg_gt == segment_info["id"]) 151 | 152 | classes = np.array(classes) 153 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64) 154 | if len(masks) == 0: 155 | # Some image does not have annotation (all ignored) 156 | instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) 157 | else: 158 | masks = BitMasks( 159 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) 160 | ) 161 | instances.gt_masks = masks.tensor 162 | 163 | dataset_dict["instances"] = instances 164 | 165 | return dataset_dict 166 | -------------------------------------------------------------------------------- /ovformer/data/datasets/lvis_v1.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import logging 3 | import os 4 | 5 | from fvcore.common.timer import Timer 6 | from detectron2.structures import BoxMode 7 | from fvcore.common.file_io import PathManager 8 | from detectron2.data import DatasetCatalog, MetadataCatalog 9 | from detectron2.data.datasets.lvis import get_lvis_instances_meta 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | __all__ = ["custom_load_lvis_json", "custom_register_lvis_instances"] 14 | 15 | 16 | def custom_register_lvis_instances(name, metadata, json_file, image_root): 17 | """ 18 | """ 19 | DatasetCatalog.register(name, lambda: custom_load_lvis_json( 20 | json_file, image_root, name)) 21 | MetadataCatalog.get(name).set( 22 | json_file=json_file, image_root=image_root, 23 | evaluator_type="lvis", **metadata 24 | ) 25 | 26 | 27 | def custom_load_lvis_json(json_file, image_root, dataset_name=None): 28 | ''' 29 | Modifications: 30 | use `file_name` 31 | convert neg_category_ids 32 | add pos_category_ids 33 | ''' 34 | from lvis import LVIS 35 | 36 | json_file = PathManager.get_local_path(json_file) 37 | 38 | timer = Timer() 39 | lvis_api = LVIS(json_file) 40 | if timer.seconds() > 1: 41 | logger.info("Loading {} takes {:.2f} seconds.".format( 42 | json_file, timer.seconds())) 43 | 44 | catid2contid = {x['id']: i for i, x in enumerate( 45 | sorted(lvis_api.dataset['categories'], key=lambda x: x['id']))} 46 | if len(lvis_api.dataset['categories']) == 1203: 47 | for x in lvis_api.dataset['categories']: 48 | assert catid2contid[x['id']] == x['id'] - 1 49 | img_ids = sorted(lvis_api.imgs.keys()) 50 | imgs = lvis_api.load_imgs(img_ids) 51 | anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids] 52 | 53 | ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image] 54 | assert len(set(ann_ids)) == len(ann_ids), \ 55 | "Annotation ids in '{}' are not unique".format(json_file) 56 | 57 | imgs_anns = list(zip(imgs, anns)) 58 | logger.info("Loaded {} images in the LVIS v1 format from {}".format( 59 | len(imgs_anns), json_file)) 60 | 61 | dataset_dicts = [] 62 | 63 | for (img_dict, anno_dict_list) in imgs_anns: 64 | record = {} 65 | if "file_name" 
in img_dict: 66 | file_name = img_dict["file_name"] 67 | if img_dict["file_name"].startswith("COCO"): 68 | file_name = file_name[-16:] 69 | record["file_name"] = os.path.join(image_root, file_name) 70 | elif 'coco_url' in img_dict: 71 | # e.g., http://images.cocodataset.org/train2017/000000391895.jpg 72 | file_name = img_dict["coco_url"][30:] 73 | record["file_name"] = os.path.join(image_root, file_name) 74 | elif 'tar_index' in img_dict: 75 | record['tar_index'] = img_dict['tar_index'] 76 | 77 | record["height"] = img_dict["height"] 78 | record["width"] = img_dict["width"] 79 | record["not_exhaustive_category_ids"] = img_dict.get( 80 | "not_exhaustive_category_ids", []) 81 | record["neg_category_ids"] = img_dict.get("neg_category_ids", []) 82 | # NOTE: modified by Xingyi: convert to 0-based 83 | record["neg_category_ids"] = [ 84 | catid2contid[x] for x in record["neg_category_ids"]] 85 | if 'pos_category_ids' in img_dict: 86 | record['pos_category_ids'] = [ 87 | catid2contid[x] for x in img_dict.get("pos_category_ids", [])] 88 | if 'captions' in img_dict: 89 | record['captions'] = img_dict['captions'] 90 | if 'caption_features' in img_dict: 91 | record['caption_features'] = img_dict['caption_features'] 92 | image_id = record["image_id"] = img_dict["id"] 93 | 94 | objs = [] 95 | for anno in anno_dict_list: 96 | assert anno["image_id"] == image_id 97 | if anno.get('iscrowd', 0) > 0: 98 | continue 99 | obj = {"bbox": anno["bbox"], "bbox_mode": BoxMode.XYWH_ABS} 100 | obj["category_id"] = catid2contid[anno['category_id']] 101 | if 'segmentation' in anno: 102 | segm = anno["segmentation"] 103 | valid_segm = [poly for poly in segm \ 104 | if len(poly) % 2 == 0 and len(poly) >= 6] 105 | # assert len(segm) == len( 106 | # valid_segm 107 | # ), "Annotation contains an invalid polygon with < 3 points" 108 | if not len(segm) == len(valid_segm): 109 | print('Annotation contains an invalid polygon with < 3 points') 110 | assert len(segm) > 0 111 | obj["segmentation"] = segm 112 | objs.append(obj) 113 | record["annotations"] = objs 114 | dataset_dicts.append(record) 115 | 116 | return dataset_dicts 117 | 118 | _CUSTOM_SPLITS_LVIS = { 119 | "lvis_v1_train+coco": ("coco/", "lvis/lvis_v1_train+coco_mask.json"), 120 | "lvis_v1_train_norare": ("coco/", "lvis/lvis_v1_train_norare.json"), 121 | "lvis_v1_train_norare_cloth": ("coco/", "lvis/lvis_v1_train_norare_cloth.json"), 122 | "lvis_v1_train_norare_coco": ("coco/", "lvis/lvis_v1_train_norare_coco.json"), 123 | "lvis_v1_train_norare_nocloth": ("coco/", "lvis/lvis_v1_train_norare_nocloth.json"), 124 | "lvis_v1_val_cloth": ("coco/", "lvis/lvis_v1_val_cloth.json"), 125 | "lvis_v1_val_person": ("coco/", "lvis/lvis_v1_val_person.json"), 126 | "lvis_v1_val_car": ("coco/", "lvis/lvis_v1_val_car.json"), 127 | "lvis_v1_val_coco": ("coco/", "lvis/lvis_v1_val_coco.json") 128 | } 129 | 130 | 131 | for key, (image_root, json_file) in _CUSTOM_SPLITS_LVIS.items(): 132 | custom_register_lvis_instances( 133 | key, 134 | get_lvis_instances_meta(key), 135 | os.path.join("datasets", json_file) if "://" not in json_file else json_file, 136 | os.path.join("datasets", image_root), 137 | ) 138 | 139 | 140 | def get_lvis_22k_meta(): 141 | from .lvis_22k_categories import CATEGORIES 142 | cat_ids = [k["id"] for k in CATEGORIES] 143 | assert min(cat_ids) == 1 and max(cat_ids) == len( 144 | cat_ids 145 | ), "Category ids are not in [1, #categories], as expected" 146 | # Ensure that the category list is sorted by id 147 | lvis_categories = sorted(CATEGORIES, key=lambda x: x["id"]) 148 
| thing_classes = [k["name"] for k in lvis_categories] 149 | meta = {"thing_classes": thing_classes} 150 | return meta 151 | 152 | _CUSTOM_SPLITS_LVIS_22K = { 153 | "lvis_v1_train_22k": ("coco/", "lvis/lvis_v1_train_lvis-22k.json"), 154 | } 155 | 156 | for key, (image_root, json_file) in _CUSTOM_SPLITS_LVIS_22K.items(): 157 | custom_register_lvis_instances( 158 | key, 159 | get_lvis_22k_meta(), 160 | os.path.join("datasets", json_file) if "://" not in json_file else json_file, 161 | os.path.join("datasets", image_root), 162 | ) 163 | -------------------------------------------------------------------------------- /tools/burst2ytvis.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | import os 4 | import json 5 | import pycocotools.mask as cocomask 6 | from tabulate import tabulate 7 | from typing import Union 8 | import copy 9 | 10 | def _global_track_id(*, local_track_id: Union[str, int], 11 | video_id: Union[str, int], 12 | track_id_mapping) -> int: 13 | # remap local track ids into globally unique ids 14 | return track_id_mapping[str(video_id)][str(local_track_id)] 15 | 16 | 17 | class B2YConverter: 18 | def __init__(self, b_format,class_split): 19 | self._b_format = b_format 20 | self._class_common_split=class_split 21 | self._split = b_format['split'] 22 | self._ori_categories = b_format['categories'] 23 | self._categories =self._make_categories() 24 | self._cate_map=self._make_map() 25 | self._videos = [] 26 | self._annotations = [] 27 | self._tracks = {} 28 | self._images = [] 29 | self._next_img_id = 0 30 | self._next_ann_id = 0 31 | 32 | self._track_id_mapping = self._load_track_id_mapping() 33 | 34 | for seq in b_format['sequences']: 35 | self._visit_seq(seq) 36 | 37 | def _load_track_id_mapping(self): 38 | id_map = {} 39 | next_global_track_id = 1 40 | for seq in self._b_format['sequences']: 41 | seq_id = seq['id'] 42 | seq_id_map = {} 43 | id_map[str(seq_id)] = seq_id_map 44 | for local_track_id in seq['track_category_ids']: 45 | seq_id_map[str(local_track_id)] = next_global_track_id 46 | next_global_track_id += 1 47 | return id_map 48 | 49 | def global_track_id(self, *, local_track_id: Union[str, int], 50 | video_id: Union[str, int]) -> int: 51 | return _global_track_id(local_track_id=local_track_id, 52 | video_id=video_id, 53 | track_id_mapping=self._track_id_mapping) 54 | 55 | def _visit_seq(self, seq): 56 | self._make_video(seq) 57 | imgs = self._make_images(seq) 58 | self._make_annotations_and_tracks(seq, imgs) 59 | 60 | def _make_images(self, seq): 61 | imgs = [] 62 | for img_path in seq['annotated_image_paths']: 63 | video = self._split + '/' + seq['dataset'] + '/' + seq['seq_name'] 64 | file_name = video + '/' + img_path 65 | 66 | # TODO: once python 3.9 is more common, we can use this nicer and safer code 67 | #stripped = img_path.removesuffix('.jpg').removesuffix('.png').removeprefix('frame') 68 | stripped = img_path.replace('.jpg', '').replace('.png', '').replace('frame', '') 69 | 70 | last = stripped.split('_')[-1] 71 | frame_idx = int(last) 72 | 73 | img = {'id': self._next_img_id, 'video': video, 74 | 'width': seq['width'], 'height': seq['height'], 75 | 'file_name': file_name, 76 | 'frame_index': frame_idx, 77 | 'video_id': seq['id']} 78 | self._next_img_id += 1 79 | self._images.append(img) 80 | imgs.append(img) 81 | return imgs 82 | 83 | def _make_video(self, seq): 84 | video_id = seq['id'] 85 | dataset = seq['dataset'] 86 | seq_name = seq['seq_name'] 87 | name = dataset + '/' + seq_name 88 
| file_name=[name+'/'+iname for iname in seq['annotated_image_paths']] 89 | video = { 90 | 'id': video_id, 'width': seq['width'], 'height': seq['height'],'length':len(file_name), 91 | 'neg_category_ids': seq['neg_category_ids'], 92 | 'not_exhaustive_category_ids': seq['not_exhaustive_category_ids'], 93 | 'file_names': file_name, 'metadata': {'dataset': dataset}} 94 | self._videos.append(video) 95 | 96 | def _make_annotations_and_tracks(self, seq, imgs): 97 | video_id = seq['id'] 98 | segs = seq['segmentations'] 99 | assert len(segs) == len(imgs), (len(segs), len(imgs)) 100 | for i in seq['track_category_ids'].keys(): 101 | segmentations=[] 102 | bboxs=[] 103 | for frame_segs, img in zip(segs, imgs): 104 | if i in frame_segs: 105 | rle = frame_segs[i]['rle'] 106 | segment = {'counts': rle, 'size': [img['height'], img['width']]} 107 | segmentations.append(segment) 108 | coco_bbox = cocomask.toBbox(segment) 109 | bbox = [int(x) for x in coco_bbox] 110 | bboxs.append(bbox) 111 | else : 112 | segmentations.append(None) 113 | bboxs.append(None) 114 | category_id = int(seq['track_category_ids'][i]) 115 | ann = {'segmentations': segmentations, 'id': self._next_ann_id, 116 | 'category_id': self._cate_map[category_id],'width': seq['width'], 'height': seq['height'], 117 | 'video_id': video_id, 118 | 'bboxes': bboxs} 119 | self._next_ann_id += 1 120 | self._annotations.append(ann) 121 | 122 | def convert(self): 123 | return {'videos': self._videos, 'annotations': self._annotations, 124 | 'images': self._images, 125 | 'categories': self._categories, 126 | 'cate_ori':self._ori_categories, 127 | 'track_id_mapping': self._track_id_mapping, 128 | 'split': self._split} 129 | 130 | def _make_categories(self): 131 | common_class=self._class_common_split['common'] 132 | uncommon_class=self._class_common_split['uncommon'] 133 | cate_mod=[] 134 | for idx,cate in enumerate(self._ori_categories): 135 | cate_2=copy.deepcopy(cate) 136 | if cate_2['id'] in common_class: 137 | cate_2['split']='common' 138 | if cate_2['id'] in uncommon_class: 139 | cate_2['split']='uncommon' 140 | cate_2['id']=idx+1 141 | cate_mod.append(cate_2) 142 | 143 | return cate_mod 144 | def _make_map(self): 145 | 146 | cate_map={} 147 | for idx,(ori,mod) in enumerate(zip(self._ori_categories,self._categories)): 148 | cate_map[ori['id']]=mod['id'] 149 | return cate_map 150 | 151 | 152 | if __name__ == '__main__': 153 | parser = argparse.ArgumentParser() 154 | parser.add_argument('--ann', type=str,default='datasets/burst/val/all_classes.json') 155 | parser.add_argument('--out', type=str,default='datasets/burst/val/b2y_val.json') 156 | args = parser.parse_args() 157 | class_common='datasets/burst/info/class_split.json' 158 | with open(class_common) as ft: 159 | class_common_dict = json.load(ft) 160 | with open(args.ann) as f: 161 | b_format_gt = json.load(f) 162 | y_format_gt = B2YConverter(b_format_gt,class_common_dict).convert() 163 | with open(args.out, 'w') as f: 164 | json.dump(y_format_gt, f) 165 | -------------------------------------------------------------------------------- /ovformer/modeling/pixel_decoder/ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import warnings 17 | import math 18 | 19 | import torch 20 | from torch import nn 21 | import torch.nn.functional as F 22 | from torch.nn.init import xavier_uniform_, constant_ 23 | 24 | from ..functions import MSDeformAttnFunction 25 | from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch 26 | 27 | 28 | def _is_power_of_2(n): 29 | if (not isinstance(n, int)) or (n < 0): 30 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 31 | return (n & (n-1) == 0) and n != 0 32 | 33 | 34 | class MSDeformAttn(nn.Module): 35 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): 36 | """ 37 | Multi-Scale Deformable Attention Module 38 | :param d_model hidden dimension 39 | :param n_levels number of feature levels 40 | :param n_heads number of attention heads 41 | :param n_points number of sampling points per attention head per feature level 42 | """ 43 | super().__init__() 44 | if d_model % n_heads != 0: 45 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 46 | _d_per_head = d_model // n_heads 47 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 48 | if not _is_power_of_2(_d_per_head): 49 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 50 | "which is more efficient in our CUDA implementation.") 51 | 52 | self.im2col_step = 128 53 | 54 | self.d_model = d_model 55 | self.n_levels = n_levels 56 | self.n_heads = n_heads 57 | self.n_points = n_points 58 | 59 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 60 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 61 | self.value_proj = nn.Linear(d_model, d_model) 62 | self.output_proj = nn.Linear(d_model, d_model) 63 | 64 | self._reset_parameters() 65 | 66 | def _reset_parameters(self): 67 | constant_(self.sampling_offsets.weight.data, 0.) 68 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 69 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 70 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 71 | for i in range(self.n_points): 72 | grid_init[:, :, i, :] *= i + 1 73 | with torch.no_grad(): 74 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 75 | constant_(self.attention_weights.weight.data, 0.) 76 | constant_(self.attention_weights.bias.data, 0.) 77 | xavier_uniform_(self.value_proj.weight.data) 78 | constant_(self.value_proj.bias.data, 0.) 79 | xavier_uniform_(self.output_proj.weight.data) 80 | constant_(self.output_proj.bias.data, 0.) 
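    # A minimal, hedged shape sketch for this module (kept as comments; the level sizes and
    # query count below are illustrative assumptions, not a configuration taken from this repo):
    #
    #   attn = MSDeformAttn(d_model=256, n_levels=2, n_heads=8, n_points=4)
    #   spatial_shapes = torch.as_tensor([[32, 32], [16, 16]], dtype=torch.long)
    #   level_start_index = torch.as_tensor([0, 32 * 32], dtype=torch.long)
    #   src_flatten = torch.rand(2, 32 * 32 + 16 * 16, 256)   # (N, sum_l H_l*W_l, C)
    #   query = torch.rand(2, 100, 256)                       # (N, Len_q, C)
    #   reference_points = torch.rand(2, 100, 2, 2)           # (N, Len_q, n_levels, 2) in [0, 1]
    #   out = attn(query, reference_points, src_flatten, spatial_shapes, level_start_index)
    #   assert out.shape == (2, 100, 256)                     # (N, Len_q, C)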
81 | 82 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 83 | """ 84 | :param query (N, Length_{query}, C) 85 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 86 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 87 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 88 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 89 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 90 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 91 | 92 | :return output (N, Length_{query}, C) 93 | """ 94 | N, Len_q, _ = query.shape 95 | N, Len_in, _ = input_flatten.shape 96 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 97 | 98 | value = self.value_proj(input_flatten) 99 | if input_padding_mask is not None: 100 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 101 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 102 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 103 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 104 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 105 | # N, Len_q, n_heads, n_levels, n_points, 2 106 | if reference_points.shape[-1] == 2: 107 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 108 | sampling_locations = reference_points[:, :, None, :, None, :] \ 109 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] 110 | elif reference_points.shape[-1] == 4: 111 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 112 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 113 | else: 114 | raise ValueError( 115 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 116 | try: 117 | output = MSDeformAttnFunction.apply( 118 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 119 | except: 120 | # CPU 121 | output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 122 | # # For FLOPs calculation only 123 | # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 124 | output = self.output_proj(output) 125 | return output 126 | -------------------------------------------------------------------------------- /ovformer/data/dataset_mappers/mask_former_instance_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
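# Hedged usage sketch: a typical way to plug the mapper defined below into a detectron2
# training loader (the actual wiring used by this repo's training script may differ):
#
#   from detectron2.data import build_detection_train_loader
#   mapper = MaskFormerInstanceDatasetMapper(cfg, is_train=True)
#   train_loader = build_detection_train_loader(cfg, mapper=mapper)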
2 | import copy 3 | import logging 4 | 5 | import numpy as np 6 | import pycocotools.mask as mask_util 7 | import torch 8 | from torch.nn import functional as F 9 | 10 | from detectron2.config import configurable 11 | from detectron2.data import detection_utils as utils 12 | from detectron2.data import transforms as T 13 | from detectron2.projects.point_rend import ColorAugSSDTransform 14 | from detectron2.structures import BitMasks, Instances, polygons_to_bitmask 15 | 16 | __all__ = ["MaskFormerInstanceDatasetMapper"] 17 | 18 | 19 | class MaskFormerInstanceDatasetMapper: 20 | """ 21 | A callable which takes a dataset dict in Detectron2 Dataset format, 22 | and map it into a format used by MaskFormer for instance segmentation. 23 | 24 | The callable currently does the following: 25 | 26 | 1. Read the image from "file_name" 27 | 2. Applies geometric transforms to the image and annotation 28 | 3. Find and applies suitable cropping to the image and annotation 29 | 4. Prepare image and annotation to Tensors 30 | """ 31 | 32 | @configurable 33 | def __init__( 34 | self, 35 | is_train=True, 36 | *, 37 | augmentations, 38 | image_format, 39 | size_divisibility, 40 | ): 41 | """ 42 | NOTE: this interface is experimental. 43 | Args: 44 | is_train: for training or inference 45 | augmentations: a list of augmentations or deterministic transforms to apply 46 | image_format: an image format supported by :func:`detection_utils.read_image`. 47 | size_divisibility: pad image size to be divisible by this value 48 | """ 49 | self.is_train = is_train 50 | self.tfm_gens = augmentations 51 | self.img_format = image_format 52 | self.size_divisibility = size_divisibility 53 | 54 | logger = logging.getLogger(__name__) 55 | mode = "training" if is_train else "inference" 56 | logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}") 57 | 58 | @classmethod 59 | def from_config(cls, cfg, is_train=True): 60 | # Build augmentation 61 | augs = [ 62 | T.ResizeShortestEdge( 63 | cfg.INPUT.MIN_SIZE_TRAIN, 64 | cfg.INPUT.MAX_SIZE_TRAIN, 65 | cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, 66 | ) 67 | ] 68 | if cfg.INPUT.CROP.ENABLED: 69 | augs.append( 70 | T.RandomCrop( 71 | cfg.INPUT.CROP.TYPE, 72 | cfg.INPUT.CROP.SIZE, 73 | ) 74 | ) 75 | if cfg.INPUT.COLOR_AUG_SSD: 76 | augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) 77 | augs.append(T.RandomFlip()) 78 | 79 | ret = { 80 | "is_train": is_train, 81 | "augmentations": augs, 82 | "image_format": cfg.INPUT.FORMAT, 83 | "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY, 84 | } 85 | return ret 86 | 87 | def __call__(self, dataset_dict): 88 | """ 89 | Args: 90 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 91 | 92 | Returns: 93 | dict: a format that builtin models in detectron2 accept 94 | """ 95 | assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!" 
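        # The steps below: (1) apply the configured augmentations to the image,
        # (2) apply the same transforms to every non-crowd annotation, (3) decode polygons,
        # COCO RLEs or binary arrays into bitmasks, (4) pad image and masks to
        # size_divisibility, and (5) pack everything into an Instances object.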
96 | 97 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 98 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 99 | utils.check_image_size(dataset_dict, image) 100 | 101 | aug_input = T.AugInput(image) 102 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) 103 | image = aug_input.image 104 | 105 | # transform instnace masks 106 | assert "annotations" in dataset_dict 107 | for anno in dataset_dict["annotations"]: 108 | anno.pop("keypoints", None) 109 | 110 | annos = [ 111 | utils.transform_instance_annotations(obj, transforms, image.shape[:2]) 112 | for obj in dataset_dict.pop("annotations") 113 | if obj.get("iscrowd", 0) == 0 114 | ] 115 | 116 | if len(annos): 117 | assert "segmentation" in annos[0] 118 | segms = [obj["segmentation"] for obj in annos] 119 | masks = [] 120 | for segm in segms: 121 | if isinstance(segm, list): 122 | # polygon 123 | masks.append(polygons_to_bitmask(segm, *image.shape[:2])) 124 | elif isinstance(segm, dict): 125 | # COCO RLE 126 | masks.append(mask_util.decode(segm)) 127 | elif isinstance(segm, np.ndarray): 128 | assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format( 129 | segm.ndim 130 | ) 131 | # mask array 132 | masks.append(segm) 133 | else: 134 | raise ValueError( 135 | "Cannot convert segmentation of type '{}' to BitMasks!" 136 | "Supported types are: polygons as list[list[float] or ndarray]," 137 | " COCO-style RLE as a dict, or a binary segmentation mask " 138 | " in a 2D numpy array of shape HxW.".format(type(segm)) 139 | ) 140 | 141 | # Pad image and segmentation label here! 142 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 143 | masks = [torch.from_numpy(np.ascontiguousarray(x)) for x in masks] 144 | 145 | classes = [int(obj["category_id"]) for obj in annos] 146 | classes = torch.tensor(classes, dtype=torch.int64) 147 | 148 | if self.size_divisibility > 0: 149 | image_size = (image.shape[-2], image.shape[-1]) 150 | padding_size = [ 151 | 0, 152 | self.size_divisibility - image_size[1], 153 | 0, 154 | self.size_divisibility - image_size[0], 155 | ] 156 | # pad image 157 | image = F.pad(image, padding_size, value=128).contiguous() 158 | # pad mask 159 | masks = [F.pad(x, padding_size, value=0).contiguous() for x in masks] 160 | 161 | image_shape = (image.shape[-2], image.shape[-1]) # h, w 162 | 163 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 164 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 165 | # Therefore it's important to use torch.Tensor. 166 | dataset_dict["image"] = image 167 | 168 | # Prepare per-category binary masks 169 | instances = Instances(image_shape) 170 | instances.gt_classes = classes 171 | if len(masks) == 0: 172 | # Some image does not have annotation (all ignored) 173 | instances.gt_masks = torch.zeros((0, image.shape[-2], image.shape[-1])) 174 | else: 175 | masks = BitMasks(torch.stack(masks)) 176 | instances.gt_masks = masks.tensor 177 | 178 | dataset_dict["instances"] = instances 179 | 180 | return dataset_dict 181 | -------------------------------------------------------------------------------- /ovformer/data/dataset_mappers/mask_former_semantic_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
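# This mapper mirrors MaskFormerInstanceDatasetMapper above, but it consumes a per-pixel
# label map ("sem_seg_file_name") instead of per-instance "annotations": every class id
# present in the label map (except ignore_label) is turned into one binary ground-truth mask.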
2 | import copy 3 | import logging 4 | 5 | import numpy as np 6 | import torch 7 | from torch.nn import functional as F 8 | 9 | from detectron2.config import configurable 10 | from detectron2.data import MetadataCatalog 11 | from detectron2.data import detection_utils as utils 12 | from detectron2.data import transforms as T 13 | from detectron2.projects.point_rend import ColorAugSSDTransform 14 | from detectron2.structures import BitMasks, Instances 15 | 16 | __all__ = ["MaskFormerSemanticDatasetMapper"] 17 | 18 | 19 | class MaskFormerSemanticDatasetMapper: 20 | """ 21 | A callable which takes a dataset dict in Detectron2 Dataset format, 22 | and map it into a format used by MaskFormer for semantic segmentation. 23 | 24 | The callable currently does the following: 25 | 26 | 1. Read the image from "file_name" 27 | 2. Applies geometric transforms to the image and annotation 28 | 3. Find and applies suitable cropping to the image and annotation 29 | 4. Prepare image and annotation to Tensors 30 | """ 31 | 32 | @configurable 33 | def __init__( 34 | self, 35 | is_train=True, 36 | *, 37 | augmentations, 38 | image_format, 39 | ignore_label, 40 | size_divisibility, 41 | ): 42 | """ 43 | NOTE: this interface is experimental. 44 | Args: 45 | is_train: for training or inference 46 | augmentations: a list of augmentations or deterministic transforms to apply 47 | image_format: an image format supported by :func:`detection_utils.read_image`. 48 | ignore_label: the label that is ignored to evaluation 49 | size_divisibility: pad image size to be divisible by this value 50 | """ 51 | self.is_train = is_train 52 | self.tfm_gens = augmentations 53 | self.img_format = image_format 54 | self.ignore_label = ignore_label 55 | self.size_divisibility = size_divisibility 56 | 57 | logger = logging.getLogger(__name__) 58 | mode = "training" if is_train else "inference" 59 | logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}") 60 | 61 | @classmethod 62 | def from_config(cls, cfg, is_train=True): 63 | # Build augmentation 64 | augs = [ 65 | T.ResizeShortestEdge( 66 | cfg.INPUT.MIN_SIZE_TRAIN, 67 | cfg.INPUT.MAX_SIZE_TRAIN, 68 | cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, 69 | ) 70 | ] 71 | if cfg.INPUT.CROP.ENABLED: 72 | augs.append( 73 | T.RandomCrop_CategoryAreaConstraint( 74 | cfg.INPUT.CROP.TYPE, 75 | cfg.INPUT.CROP.SIZE, 76 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA, 77 | cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 78 | ) 79 | ) 80 | if cfg.INPUT.COLOR_AUG_SSD: 81 | augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) 82 | augs.append(T.RandomFlip()) 83 | 84 | # Assume always applies to the training set. 85 | dataset_names = cfg.DATASETS.TRAIN 86 | meta = MetadataCatalog.get(dataset_names[0]) 87 | ignore_label = meta.ignore_label 88 | 89 | ret = { 90 | "is_train": is_train, 91 | "augmentations": augs, 92 | "image_format": cfg.INPUT.FORMAT, 93 | "ignore_label": ignore_label, 94 | "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY, 95 | } 96 | return ret 97 | 98 | def __call__(self, dataset_dict): 99 | """ 100 | Args: 101 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 102 | 103 | Returns: 104 | dict: a format that builtin models in detectron2 accept 105 | """ 106 | assert self.is_train, "MaskFormerSemanticDatasetMapper should only be used for training!" 
107 | 108 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 109 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 110 | utils.check_image_size(dataset_dict, image) 111 | 112 | if "sem_seg_file_name" in dataset_dict: 113 | # PyTorch transformation not implemented for uint16, so converting it to double first 114 | sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") 115 | else: 116 | sem_seg_gt = None 117 | 118 | if sem_seg_gt is None: 119 | raise ValueError( 120 | "Cannot find 'sem_seg_file_name' for semantic segmentation dataset {}.".format( 121 | dataset_dict["file_name"] 122 | ) 123 | ) 124 | 125 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt) 126 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) 127 | image = aug_input.image 128 | sem_seg_gt = aug_input.sem_seg 129 | 130 | # Pad image and segmentation label here! 131 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 132 | if sem_seg_gt is not None: 133 | sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) 134 | 135 | if self.size_divisibility > 0: 136 | image_size = (image.shape[-2], image.shape[-1]) 137 | padding_size = [ 138 | 0, 139 | self.size_divisibility - image_size[1], 140 | 0, 141 | self.size_divisibility - image_size[0], 142 | ] 143 | image = F.pad(image, padding_size, value=128).contiguous() 144 | if sem_seg_gt is not None: 145 | sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous() 146 | 147 | image_shape = (image.shape[-2], image.shape[-1]) # h, w 148 | 149 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 150 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 151 | # Therefore it's important to use torch.Tensor. 152 | dataset_dict["image"] = image 153 | 154 | if sem_seg_gt is not None: 155 | dataset_dict["sem_seg"] = sem_seg_gt.long() 156 | 157 | if "annotations" in dataset_dict: 158 | raise ValueError("Semantic segmentation dataset should not have 'annotations'.") 159 | 160 | # Prepare per-category binary masks 161 | if sem_seg_gt is not None: 162 | sem_seg_gt = sem_seg_gt.numpy() 163 | instances = Instances(image_shape) 164 | classes = np.unique(sem_seg_gt) 165 | # remove ignored region 166 | classes = classes[classes != self.ignore_label] 167 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64) 168 | 169 | masks = [] 170 | for class_id in classes: 171 | masks.append(sem_seg_gt == class_id) 172 | 173 | if len(masks) == 0: 174 | # Some image does not have annotation (all ignored) 175 | instances.gt_masks = torch.zeros((0, sem_seg_gt.shape[-2], sem_seg_gt.shape[-1])) 176 | else: 177 | masks = BitMasks( 178 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) 179 | ) 180 | instances.gt_masks = masks.tensor 181 | 182 | dataset_dict["instances"] = instances 183 | 184 | return dataset_dict 185 | -------------------------------------------------------------------------------- /ovformer/modeling/transformer_decoder/maskformer_transformer_decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py 3 | import fvcore.nn.weight_init as weight_init 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | from detectron2.config import configurable 9 | from detectron2.layers import Conv2d 10 | from detectron2.utils.registry import Registry 11 | 12 | from .position_encoding import PositionEmbeddingSine 13 | from .transformer import Transformer 14 | 15 | 16 | TRANSFORMER_DECODER_REGISTRY = Registry("TRANSFORMER_MODULE") 17 | TRANSFORMER_DECODER_REGISTRY.__doc__ = """ 18 | Registry for transformer module in MaskFormer. 19 | """ 20 | 21 | 22 | def build_transformer_decoder(cfg, in_channels, mask_classification=True): 23 | """ 24 | Build a instance embedding branch from `cfg.MODEL.INS_EMBED_HEAD.NAME`. 25 | """ 26 | name = cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME 27 | return TRANSFORMER_DECODER_REGISTRY.get(name)(cfg, in_channels, mask_classification) 28 | 29 | 30 | @TRANSFORMER_DECODER_REGISTRY.register() 31 | class StandardTransformerDecoder(nn.Module): 32 | @configurable 33 | def __init__( 34 | self, 35 | in_channels, 36 | mask_classification=True, 37 | *, 38 | num_classes: int, 39 | hidden_dim: int, 40 | num_queries: int, 41 | nheads: int, 42 | dropout: float, 43 | dim_feedforward: int, 44 | enc_layers: int, 45 | dec_layers: int, 46 | pre_norm: bool, 47 | deep_supervision: bool, 48 | mask_dim: int, 49 | enforce_input_project: bool, 50 | ): 51 | """ 52 | NOTE: this interface is experimental. 53 | Args: 54 | in_channels: channels of the input features 55 | mask_classification: whether to add mask classifier or not 56 | num_classes: number of classes 57 | hidden_dim: Transformer feature dimension 58 | num_queries: number of queries 59 | nheads: number of heads 60 | dropout: dropout in Transformer 61 | dim_feedforward: feature dimension in feedforward network 62 | enc_layers: number of Transformer encoder layers 63 | dec_layers: number of Transformer decoder layers 64 | pre_norm: whether to use pre-LayerNorm or not 65 | deep_supervision: whether to add supervision to every decoder layers 66 | mask_dim: mask feature dimension 67 | enforce_input_project: add input project 1x1 conv even if input 68 | channels and hidden dim is identical 69 | """ 70 | super().__init__() 71 | 72 | self.mask_classification = mask_classification 73 | 74 | # positional encoding 75 | N_steps = hidden_dim // 2 76 | self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) 77 | 78 | transformer = Transformer( 79 | d_model=hidden_dim, 80 | dropout=dropout, 81 | nhead=nheads, 82 | dim_feedforward=dim_feedforward, 83 | num_encoder_layers=enc_layers, 84 | num_decoder_layers=dec_layers, 85 | normalize_before=pre_norm, 86 | return_intermediate_dec=deep_supervision, 87 | ) 88 | 89 | self.num_queries = num_queries 90 | self.transformer = transformer 91 | hidden_dim = transformer.d_model 92 | 93 | self.query_embed = nn.Embedding(num_queries, hidden_dim) 94 | 95 | if in_channels != hidden_dim or enforce_input_project: 96 | self.input_proj = Conv2d(in_channels, hidden_dim, kernel_size=1) 97 | weight_init.c2_xavier_fill(self.input_proj) 98 | else: 99 | self.input_proj = nn.Sequential() 100 | self.aux_loss = deep_supervision 101 | 102 | # output FFNs 103 | if self.mask_classification: 104 | self.class_embed = nn.Linear(hidden_dim, num_classes + 1) 105 | self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3) 106 | 107 | @classmethod 108 | def from_config(cls, cfg, in_channels, 
mask_classification): 109 | ret = {} 110 | ret["in_channels"] = in_channels 111 | ret["mask_classification"] = mask_classification 112 | 113 | ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES 114 | ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM 115 | ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES 116 | # Transformer parameters: 117 | ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS 118 | ret["dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT 119 | ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD 120 | ret["enc_layers"] = cfg.MODEL.MASK_FORMER.ENC_LAYERS 121 | ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS 122 | ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM 123 | ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION 124 | ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ 125 | 126 | ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 127 | 128 | return ret 129 | 130 | def forward(self, x, mask_features, mask=None): 131 | if mask is not None: 132 | mask = F.interpolate(mask[None].float(), size=x.shape[-2:]).to(torch.bool)[0] 133 | pos = self.pe_layer(x, mask) 134 | 135 | src = x 136 | hs, memory = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos) 137 | 138 | if self.mask_classification: 139 | outputs_class = self.class_embed(hs) 140 | out = {"pred_logits": outputs_class[-1]} 141 | else: 142 | out = {} 143 | 144 | if self.aux_loss: 145 | # [l, bs, queries, embed] 146 | mask_embed = self.mask_embed(hs) 147 | outputs_seg_masks = torch.einsum("lbqc,bchw->lbqhw", mask_embed, mask_features) 148 | out["pred_masks"] = outputs_seg_masks[-1] 149 | out["aux_outputs"] = self._set_aux_loss( 150 | outputs_class if self.mask_classification else None, outputs_seg_masks 151 | ) 152 | else: 153 | # FIXME h_boxes takes the last one computed, keep this in mind 154 | # [bs, queries, embed] 155 | mask_embed = self.mask_embed(hs[-1]) 156 | outputs_seg_masks = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features) 157 | out["pred_masks"] = outputs_seg_masks 158 | return out 159 | 160 | @torch.jit.unused 161 | def _set_aux_loss(self, outputs_class, outputs_seg_masks): 162 | # this is a workaround to make torchscript happy, as torchscript 163 | # doesn't support dictionary with non-homogeneous values, such 164 | # as a dict having both a Tensor and a list. 
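        # Each entry returned below corresponds to one intermediate decoder layer; the final
        # layer is dropped (the [:-1]) because its predictions are already exposed as the
        # top-level "pred_logits"/"pred_masks" in forward().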
165 | if self.mask_classification: 166 | return [ 167 | {"pred_logits": a, "pred_masks": b} 168 | for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1]) 169 | ] 170 | else: 171 | return [{"pred_masks": b} for b in outputs_seg_masks[:-1]] 172 | 173 | 174 | class MLP(nn.Module): 175 | """Very simple multi-layer perceptron (also called FFN)""" 176 | 177 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 178 | super().__init__() 179 | self.num_layers = num_layers 180 | h = [hidden_dim] * (num_layers - 1) 181 | self.layers = nn.ModuleList( 182 | nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) 183 | ) 184 | 185 | def forward(self, x): 186 | for i, layer in enumerate(self.layers): 187 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 188 | return x 189 | -------------------------------------------------------------------------------- /ovformer/data_video/datasets/burst.py: -------------------------------------------------------------------------------- 1 | 2 | import contextlib 3 | import io 4 | import json 5 | import logging 6 | import numpy as np 7 | import os 8 | import pycocotools.mask as mask_util 9 | from fvcore.common.file_io import PathManager 10 | from fvcore.common.timer import Timer 11 | 12 | from detectron2.structures import Boxes, BoxMode, PolygonMasks 13 | from detectron2.data import DatasetCatalog, MetadataCatalog 14 | from .burst_categories import BURST_CATEGORIES 15 | """ 16 | This file contains functions to parse YTVIS dataset of 17 | COCO-format annotations into dicts in "Detectron2 format". 18 | """ 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | __all__ = ["load_burst_json", "register_burst_instances"] 23 | 24 | 25 | def _get_burst_instances_meta(): 26 | thing_ids = [k["id"] for k in BURST_CATEGORIES] 27 | assert len(thing_ids) == 482, len(thing_ids) 28 | # Mapping from the incontiguous YTVIS category id to an id in [0, 39] 29 | thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} 30 | thing_classes = [k["name"] for k in BURST_CATEGORIES] 31 | ret = { 32 | "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, 33 | "thing_classes": thing_classes, 34 | } 35 | return ret 36 | 37 | 38 | def load_burst_json(json_file, image_root, dataset_name=None, extra_annotation_keys=None): 39 | from .ytvis_api.burst import BURST 40 | 41 | timer = Timer() 42 | json_file = PathManager.get_local_path(json_file) 43 | with contextlib.redirect_stdout(io.StringIO()): 44 | ytvis_api = BURST(json_file) 45 | if timer.seconds() > 1: 46 | logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds())) 47 | 48 | id_map = None 49 | if dataset_name is not None: 50 | meta = MetadataCatalog.get(dataset_name) 51 | cat_ids = sorted(ytvis_api.getCatIds()) 52 | cats = ytvis_api.loadCats(cat_ids) 53 | # The categories in a custom json file may not be sorted. 54 | thing_classes = [c["name"] for c in sorted(cats, key=lambda x: x["id"])] 55 | meta.thing_classes = thing_classes 56 | 57 | # In COCO, certain category ids are artificially removed, 58 | # and by convention they are always ignored. 59 | # We deal with COCO's id issue and translate 60 | # the category ids to contiguous ids in [0, 80). 61 | 62 | # It works by looking at the "categories" field in the json, therefore 63 | # if users' own json also have incontiguous ids, we'll 64 | # apply this mapping as well but print a warning. 
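    # e.g. cat_ids = [1, 3, 7]  ->  id_map = {1: 0, 3: 1, 7: 2}: dataset category ids are
    # remapped to contiguous, 0-based ids for training.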
65 | if not (min(cat_ids) == 1 and max(cat_ids) == len(cat_ids)): 66 | if "coco" not in dataset_name: 67 | logger.warning( 68 | """ 69 | Category ids in annotations are not in [1, #categories]! We'll apply a mapping for you. 70 | """ 71 | ) 72 | id_map = {v: i for i, v in enumerate(cat_ids)} 73 | meta.thing_dataset_id_to_contiguous_id = id_map 74 | # sort indices for reproducible results 75 | vid_ids = sorted(ytvis_api.vids.keys()) 76 | # vids is a list of dicts, each looks something like: 77 | # {'license': 1, 78 | # 'flickr_url': ' ', 79 | # 'file_names': ['ff25f55852/00000.jpg', 'ff25f55852/00005.jpg', ..., 'ff25f55852/00175.jpg'], 80 | # 'height': 720, 81 | # 'width': 1280, 82 | # 'length': 36, 83 | # 'date_captured': '2019-04-11 00:55:41.903902', 84 | # 'id': 2232} 85 | vids = ytvis_api.loadVids(vid_ids) 86 | 87 | anns = [ytvis_api.vidToAnns[vid_id] for vid_id in vid_ids] 88 | total_num_valid_anns = sum([len(x) for x in anns]) 89 | total_num_anns = len(ytvis_api.anns) 90 | if total_num_valid_anns < total_num_anns: 91 | logger.warning( 92 | f"{json_file} contains {total_num_anns} annotations, but only " 93 | f"{total_num_valid_anns} of them match to images in the file." 94 | ) 95 | 96 | vids_anns = list(zip(vids, anns)) 97 | logger.info("Loaded {} videos in YTVIS format from {}".format(len(vids_anns), json_file)) 98 | 99 | dataset_dicts = [] 100 | 101 | ann_keys = ["iscrowd", "category_id", "id"] + (extra_annotation_keys or []) 102 | 103 | num_instances_without_valid_segmentation = 0 104 | 105 | for (vid_dict, anno_dict_list) in vids_anns: 106 | record = {} 107 | #record["file_names"] = [os.path.join(image_root, vid_dict["file_names"][i]) for i in range(vid_dict["length"])] 108 | record["file_names"] = [os.path.join(image_root, '/'.join(vid_dict["file_names"][i].split('\\')[-2:])) for i in range(vid_dict["length"])] 109 | record["height"] = vid_dict["height"] 110 | record["width"] = vid_dict["width"] 111 | record["length"] = vid_dict["length"] 112 | video_id = record["video_id"] = vid_dict["id"] 113 | 114 | video_objs = [] 115 | for frame_idx in range(record["length"]): 116 | frame_objs = [] 117 | for anno in anno_dict_list: 118 | assert anno["video_id"] == video_id 119 | 120 | obj = {key: anno[key] for key in ann_keys if key in anno} 121 | 122 | _segm = anno.get("segmentations", None) 123 | 124 | if not ( _segm and _segm[frame_idx]): 125 | continue 126 | 127 | segm = _segm[frame_idx] 128 | 129 | 130 | if isinstance(segm, dict): 131 | if isinstance(segm["counts"], list): 132 | # convert to compressed RLE 133 | segm = mask_util.frPyObjects(segm, *segm["size"]) 134 | elif segm: 135 | # filter out invalid polygons (< 3 points) 136 | segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6] 137 | if len(segm) == 0: 138 | num_instances_without_valid_segmentation += 1 139 | continue # ignore this instance 140 | obj["segmentation"] = segm 141 | 142 | if id_map: 143 | obj["category_id"] = id_map[obj["category_id"]] 144 | frame_objs.append(obj) 145 | video_objs.append(frame_objs) 146 | record["annotations"] = video_objs 147 | dataset_dicts.append(record) 148 | 149 | if num_instances_without_valid_segmentation > 0: 150 | logger.warning( 151 | "Filtered out {} instances without valid segmentation. ".format( 152 | num_instances_without_valid_segmentation 153 | ) 154 | + "There might be issues in your dataset generation process. " 155 | "A valid polygon should be a list[float] with even length >= 6." 
156 | ) 157 | return dataset_dicts 158 | 159 | 160 | def register_burst_instances(name, metadata, json_file, image_root): 161 | """ 162 | Register a dataset in YTVIS's json annotation format for 163 | instance tracking. 164 | 165 | Args: 166 | name (str): the name that identifies a dataset, e.g. "ytvis_train". 167 | metadata (dict): extra metadata associated with this dataset. You can 168 | leave it as an empty dict. 169 | json_file (str): path to the json instance annotation file. 170 | image_root (str or path-like): directory which contains all the images. 171 | """ 172 | assert isinstance(name, str), name 173 | assert isinstance(json_file, (str, os.PathLike)), json_file 174 | assert isinstance(image_root, (str, os.PathLike)), image_root 175 | # 1. register a function which returns dicts 176 | DatasetCatalog.register(name, lambda: load_burst_json(json_file, image_root, name)) 177 | 178 | # 2. Optionally, add metadata about this dataset, 179 | # since they might be useful in evaluation, visualization or logging 180 | MetadataCatalog.get(name).set( 181 | json_file=json_file, image_root=image_root, evaluator_type="ytvis", **metadata 182 | ) 183 | 184 | --------------------------------------------------------------------------------
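# Hedged usage sketch (not part of the repository files above): registering a BURST split
# converted by tools/burst2ytvis.py with register_burst_instances, mirroring the LVIS
# registration pattern in ovformer/data/datasets/lvis_v1.py. The image root below is an
# illustrative placeholder, not a path guaranteed by this repo.
#
#   from ovformer.data_video.datasets.burst import register_burst_instances, _get_burst_instances_meta
#   register_burst_instances(
#       "burst_val_b2y",                         # hypothetical dataset name
#       _get_burst_instances_meta(),
#       "datasets/burst/val/b2y_val.json",       # default output of tools/burst2ytvis.py
#       "datasets/burst/frames",                 # hypothetical image root
#   )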