├── ovformer ├── evaluation │ ├── __init__.py │ └── instance_evaluation.py ├── utils │ ├── __init__.py │ └── misc.py ├── modeling │ ├── backbone │ │ └── __init__.py │ ├── meta_arch │ │ ├── __init__.py │ │ └── mask_former_head.py │ ├── pixel_decoder │ │ ├── __init__.py │ │ └── ops │ │ │ ├── make.sh │ │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn.py │ │ │ ├── functions │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn_func.py │ │ │ ├── src │ │ │ ├── vision.cpp │ │ │ ├── cuda │ │ │ │ └── ms_deform_attn_cuda.h │ │ │ ├── cpu │ │ │ │ ├── ms_deform_attn_cpu.h │ │ │ │ └── ms_deform_attn_cpu.cpp │ │ │ └── ms_deform_attn.h │ │ │ ├── setup.py │ │ │ └── test.py │ ├── transformer_decoder │ │ ├── __init__.py │ │ ├── position_encoding.py │ │ ├── zero_shot_classifier.py │ │ └── maskformer_transformer_decoder.py │ ├── __init__.py │ └── util.py ├── data │ ├── dataset_mappers │ │ ├── __init__.py │ │ ├── coco_panoptic_new_baseline_dataset_mapper.py │ │ ├── mask_former_panoptic_dataset_mapper.py │ │ ├── mask_former_instance_dataset_mapper.py │ │ └── mask_former_semantic_dataset_mapper.py │ ├── __init__.py │ └── datasets │ │ ├── __init__.py │ │ ├── register_ade20k_instance.py │ │ └── lvis_v1.py ├── data_video │ ├── datasets │ │ ├── ytvis_api │ │ │ └── __init__.py │ │ ├── __init__.py │ │ ├── builtin.py │ │ └── burst.py │ ├── __init__.py │ └── augmentation.py ├── __init__.py ├── test_time_augmentation.py └── config.py ├── OVFormer.png ├── ovformer_video ├── utils │ ├── __init__.py │ └── memory.py ├── modeling │ ├── __init__.py │ └── transformer_decoder │ │ ├── __init__.py │ │ ├── position_encoding.py │ │ └── zero_shot_classifier.py ├── __init__.py └── config.py ├── requirements.txt ├── evaluate ├── __init__.py └── mask.py ├── configs ├── lvis │ ├── ovformer_SwinB_bs8.yaml │ ├── Base-COCO-InstanceSegmentation.yaml │ └── ovformer_R50_bs8.yaml ├── ovis │ ├── ovformer_SwinB_bs8.yaml │ └── ovformer_R50_bs8.yaml ├── burst │ ├── ovformer_SwinB_bs8.yaml │ └── ovformer_R50_bs8.yaml ├── youtubevis_2019 │ ├── ovformer_SwinB_bs8.yaml │ └── ovformer_R50_bs8.yaml ├── youtubevis_2021 │ ├── ovformer_SwinB_bs8.yaml │ └── ovformer_R50_bs8.yaml └── lvvis │ ├── video_ovformer_SwinB_bs8.yaml │ ├── Base-LVVIS-VideoInstanceSegmentation.yaml │ ├── ovformer_R50_bs8.yaml │ └── video_ovformer_R50_bs8.yaml ├── .gitignore ├── tools ├── remove_lvvis_novel.py ├── remove_lvis_rare.py ├── lvivs_test_instances_json.py ├── convert-thirdparty-pretrained-model-to-d2.py ├── ytvis_json.py ├── get_lvvis_cat_info.py ├── get_lvis_cat_info.py ├── vis_results.py ├── save_clip_features.py ├── analyze_model.py └── burst2ytvis.py ├── INSTALL.md ├── mAP.py ├── README.md └── datasets └── README.md /ovformer/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /OVFormer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fanghaook/OVFormer/HEAD/OVFormer.png -------------------------------------------------------------------------------- /ovformer/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /ovformer_video/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. 
and its affiliates. 2 | -------------------------------------------------------------------------------- /ovformer/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /ovformer/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /ovformer/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /ovformer/modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | opencv-python==4.8.0.74 2 | cython 3 | scipy 4 | shapely 5 | timm==0.5.4 6 | h5py 7 | submitit 8 | scikit-image -------------------------------------------------------------------------------- /ovformer/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import datasets 3 | from .catalog import DatasetCatalog, MetadataCatalog 4 | -------------------------------------------------------------------------------- /ovformer/data_video/datasets/ytvis_api/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi 3 | -------------------------------------------------------------------------------- /evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | from .burst import BURST 2 | from .bursteval import BURSTeval 3 | from .lvvis import LVVIS 4 | from .lvviseval import LVVISeval 5 | from .mask import encode ,decode,area,toBbox -------------------------------------------------------------------------------- /ovformer_video/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .transformer_decoder.video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder 3 | -------------------------------------------------------------------------------- /ovformer_video/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder 3 | -------------------------------------------------------------------------------- /ovformer/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | from .maskformer_transformer_decoder import StandardTransformerDecoder 3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder 4 | -------------------------------------------------------------------------------- /ovformer_video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import modeling 3 | 4 | # config 5 | from .config import add_ovformer_video_config 6 | 7 | # models 8 | from .video_ovformer_model import VideoOVFormer 9 | 10 | -------------------------------------------------------------------------------- /ovformer/data_video/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from . import builtin # ensure the builtin datasets are registered 5 | 6 | __all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")] 7 | -------------------------------------------------------------------------------- /ovformer/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import ( 3 | register_ade20k_full, 4 | register_ade20k_panoptic, 5 | register_coco_stuff_10k, 6 | register_mapillary_vistas, 7 | register_coco_panoptic_annos_semseg, 8 | register_ade20k_instance, 9 | register_mapillary_vistas_panoptic, 10 | lvis_v1, 11 | lvvis_oracle, 12 | ) 13 | -------------------------------------------------------------------------------- /ovformer/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .backbone.swin import D2SwinTransformer 3 | from .backbone.timm import build_timm_backbone 4 | from .pixel_decoder.fpn import BasePixelDecoder 5 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder 6 | from .meta_arch.mask_former_head import MaskFormerHead 7 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead 8 | -------------------------------------------------------------------------------- /ovformer_video/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | from detectron2.config import CfgNode as CN 4 | 5 | 6 | def add_ovformer_video_config(cfg): 7 | # video data 8 | # DataLoader 9 | cfg.INPUT.SAMPLING_FRAME_NUM = 2 10 | cfg.INPUT.SAMPLING_FRAME_RANGE = 20 11 | cfg.INPUT.SAMPLING_FRAME_SHUFFLE = False 12 | cfg.INPUT.AUGMENTATIONS = [] # "brightness", "contrast", "saturation", "rotation" 13 | -------------------------------------------------------------------------------- /ovformer/data_video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from .dataset_mapper import YTVISDatasetMapper, CocoClipDatasetMapper 5 | from .build import * 6 | 7 | from .datasets import * 8 | from .ytvis_eval import YTVISEvaluator 9 | from .ovis_eval import OVISEvaluator 10 | from .lvvis_eval import LVVISEvaluator 11 | from .lvvis_eval_video import LVVISEvaluator_video 12 | from .burst_eval import BURSTEvaluator 13 | -------------------------------------------------------------------------------- /configs/lvis/ovformer_SwinB_bs8.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ovformer_R50_bs8.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "models/swin_base_patch4_window12_384_22k.pkl" 15 | SEM_SEG_HEAD: 16 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 17 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 18 | 19 | -------------------------------------------------------------------------------- /configs/ovis/ovformer_SwinB_bs8.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ovformer_R50_bs8.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "models/swin_base_patch4_window12_384_22k.pkl" 15 | SEM_SEG_HEAD: 16 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 17 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 18 | 19 | INPUT: 20 | MIN_SIZE_TEST: 480 21 | 22 | -------------------------------------------------------------------------------- /configs/burst/ovformer_SwinB_bs8.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ovformer_R50_bs8.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "models/swin_base_patch4_window12_384_22k.pkl" 15 | SEM_SEG_HEAD: 16 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 17 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 18 | 19 | INPUT: 20 | MIN_SIZE_TEST: 480 21 | 22 | -------------------------------------------------------------------------------- /configs/youtubevis_2019/ovformer_SwinB_bs8.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ovformer_R50_bs8.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "models/swin_base_patch4_window12_384_22k.pkl" 15 | SEM_SEG_HEAD: 16 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 17 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 18 | 19 | INPUT: 20 | MIN_SIZE_TEST: 480 21 | 22 | -------------------------------------------------------------------------------- /configs/youtubevis_2021/ovformer_SwinB_bs8.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: ovformer_R50_bs8.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "models/swin_base_patch4_window12_384_22k.pkl" 15 | SEM_SEG_HEAD: 16 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 17 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 18 | 19 | INPUT: 20 | MIN_SIZE_TEST: 480 21 | 22 | -------------------------------------------------------------------------------- /configs/lvvis/video_ovformer_SwinB_bs8.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: video_ovformer_R50_bs8.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "models/OVFormer_swin_lvis.pth" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | SEM_SEG_HEAD: 18 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 19 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 20 | INPUT: 21 | MIN_SIZE_TEST: 480 22 | 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # output dir 2 | output* 3 | 4 | *.json 5 | *.diff 6 | *.jpg 7 | !/projects/DensePose/doc/images/*.jpg 8 | 9 | # compilation and distribution 10 | __pycache__ 11 | _ext 12 | *.pyc 13 | *.pyd 14 | *.so 15 | *.dll 16 | *.egg-info/ 17 | build/ 18 | dist/ 19 | wheels/ 20 | 21 | # pytorch/python/numpy formats 22 | *.pth 23 | *.pkl 24 | *.npy 25 | *.ts 26 | model_ts*.txt 27 | 28 | # ipython/jupyter notebooks 29 | *.ipynb 30 | **/.ipynb_checkpoints/ 31 | 32 | # Editor temporaries 33 | *.swn 34 | *.swo 35 | *.swp 36 | *~ 37 | 38 | # editor settings 39 | .idea 40 | .vscode 41 | _darcs 42 | 43 | # project dirs 44 | /detectron2/model_zoo/configs 45 | /datasets/* 46 | !/datasets/*.* 47 | /projects/*/datasets 48 | /models 49 | /snippet 50 | 51 | detectron2 -------------------------------------------------------------------------------- /ovformer/modeling/pixel_decoder/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | # Copyright (c) Facebook, Inc. and its affiliates. 
11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 12 | 13 | python setup.py build install 14 | -------------------------------------------------------------------------------- /ovformer/modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /tools/remove_lvvis_novel.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | if __name__ == '__main__': 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('--ann', default='datasets/LVVIS/train/train_instances_.json') 7 | args = parser.parse_args() 8 | 9 | print('Loading', args.ann) 10 | data = json.load(open(args.ann, 'r')) 11 | print('all #anns', len(data['annotations'])) # 15967 12 | 13 | novel_categories = [i['id'] for i in data['categories'] if i['partition'] in [2, 3]] 14 | data['annotations'] = [x for x in data['annotations'] if x['category_id'] not in novel_categories] 15 | 16 | print('nonovel #anns', len(data['annotations'])) # 10884 17 | out_path = args.ann[:-5] + 'nonovel.json' 18 | print('Saving to', out_path) 19 | json.dump(data, open(out_path, 'w')) 20 | 21 | -------------------------------------------------------------------------------- /ovformer/modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /tools/remove_lvis_rare.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
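# Purpose (summarized from the code below): drop every annotation whose category frequency is
# 'r' (rare) from an LVIS-format json and save the result as <ann minus .json>_norare.json,
# which appears to back the "lvis_v1_train_norare" split used in
# configs/lvis/Base-COCO-InstanceSegmentation.yaml.
# Assumed invocation, based on the argparse default below:
#   python tools/remove_lvis_rare.py --ann datasets/lvis/lvis_v1_train.json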
2 | import argparse 3 | import json 4 | 5 | if __name__ == '__main__': 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--ann', default='datasets/lvis/lvis_v1_train.json') 8 | args = parser.parse_args() 9 | 10 | print('Loading', args.ann) 11 | data = json.load(open(args.ann, 'r')) 12 | catid2freq = {x['id']: x['frequency'] for x in data['categories']} 13 | print('ori #anns', len(data['annotations'])) 14 | exclude = ['r'] 15 | data['annotations'] = [x for x in data['annotations'] \ 16 | if catid2freq[x['category_id']] not in exclude] 17 | print('filtered #anns', len(data['annotations'])) 18 | out_path = args.ann[:-5] + '_norare.json' 19 | print('Saving to', out_path) 20 | json.dump(data, open(out_path, 'w')) 21 | -------------------------------------------------------------------------------- /ovformer/modeling/pixel_decoder/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include "ms_deform_attn.h" 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /configs/lvis/Base-COCO-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_timm_backbone" 5 | WEIGHTS: "models/resnet50_miil_21k.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | TIMM: 9 | BASE_NAME: resnet50_in21k 10 | DATASETS: 11 | TRAIN: ("lvis_v1_train_norare",) 12 | TEST: ("lvis_v1_val",) 13 | SOLVER: 14 | IMS_PER_BATCH: 8 15 | BASE_LR: 0.0001 16 | STEPS: (398250, 420375) 17 | MAX_ITER: 442500 18 | CHECKPOINT_PERIOD: 20000 19 | WARMUP_FACTOR: 1.0 20 | WARMUP_ITERS: 10 21 | WEIGHT_DECAY: 0.05 22 | OPTIMIZER: "ADAMW" 23 | BACKBONE_MULTIPLIER: 0.1 24 | CLIP_GRADIENTS: 25 | ENABLED: True 26 | CLIP_TYPE: "full_model" 27 | CLIP_VALUE: 0.01 28 | NORM_TYPE: 2.0 29 | AMP: 30 | ENABLED: True 31 | INPUT: 32 | IMAGE_SIZE: 1024 33 | MIN_SCALE: 0.1 34 | MAX_SCALE: 2.0 35 | FORMAT: "RGB" 36 | DATASET_MAPPER_NAME: "coco_instance_lsj" 37 | TEST: 38 | EVAL_PERIOD: 20000 39 | DATALOADER: 40 | FILTER_EMPTY_ANNOTATIONS: True 41 | NUM_WORKERS: 8 42 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 43 | REPEAT_THRESHOLD: 0.001 44 | VERSION: 2 45 | -------------------------------------------------------------------------------- /tools/lvivs_test_instances_json.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import cv2 4 | 5 | val_instances = 
json.load(open('datasets/LVVIS/val/val_instances_.json', 'r')) 6 | categories = val_instances['categories'] 7 | 8 | videos = [] 9 | video_folder = 'datasets/lvvis/test/JPEGImages' 10 | video_ids = sorted(os.listdir(video_folder)) 11 | for video_id, video_name in enumerate(video_ids): 12 | video_path = os.path.join(video_folder, video_name) 13 | file_names = sorted(os.listdir(video_path)) 14 | 15 | first_frame_path = os.path.join(video_path, file_names[0]) 16 | frame = cv2.imread(first_frame_path) 17 | height, width, _ = frame.shape 18 | 19 | video_info = { 20 | 'id': video_id, 21 | 'width': width, 22 | 'height': height, 23 | 'length': len(file_names), 24 | 'file_names': [os.path.join(video_name, file_name) for file_name in file_names] 25 | } 26 | videos.append(video_info) 27 | 28 | test_data = { 29 | 'videos': videos, 30 | 'categories': categories 31 | } 32 | 33 | with open('datasets/LVVIS/test/test_instances.json', 'w') as f: 34 | json.dump(test_data, f, indent=4) 35 | 36 | print("test_instances.json done.") -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | ### Requirements 4 | - Linux or macOS with Python ≥ 3.6 5 | - PyTorch ≥ 1.9 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation. 6 | Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note: please check that the 7 | PyTorch version matches the one required by Detectron2. 8 | - Detectron2: follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html). 9 | - `pip install -r requirements.txt` 10 | 11 | ### Example conda environment setup 12 | ```bash 13 | conda create --name ovformer python=3.8 -y 14 | conda activate ovformer 15 | conda install pytorch==1.9.0 torchvision==0.10.0 cudatoolkit=11.1 -c pytorch -c nvidia 16 | pip install git+https://github.com/cocodataset/panopticapi.git 17 | pip install git+https://github.com/lvis-dataset/lvis-api.git 18 | 19 | # under your working directory 20 | git clone git@github.com:facebookresearch/detectron2.git 21 | cd detectron2 22 | pip install -e . 23 | 24 | cd .. 25 | git clone https://github.com/fanghaook/OVFormer.git 26 | cd OVFormer 27 | pip install -r requirements.txt 28 | cd ovformer/modeling/pixel_decoder/ops 29 | sh make.sh 30 | cd ../../../.. 
31 | ``` 32 | -------------------------------------------------------------------------------- /configs/lvvis/Base-LVVIS-VideoInstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | NAME: "build_timm_backbone" 4 | WEIGHTS: "models/resnet50_miil_21k.pkl" 5 | PIXEL_MEAN: [123.675, 116.280, 103.530] 6 | PIXEL_STD: [58.395, 57.120, 57.375] 7 | TIMM: 8 | BASE_NAME: resnet50_in21k 9 | DATASETS: 10 | TRAIN: ("lvvis_train",) 11 | TEST: ("lvvis_val",) 12 | SOLVER: 13 | IMS_PER_BATCH: 8 14 | BASE_LR: 0.0001 15 | STEPS: (1000,) 16 | MAX_ITER: 2000 17 | CHECKPOINT_PERIOD: 1000 18 | WARMUP_FACTOR: 1.0 19 | WARMUP_ITERS: 10 20 | WEIGHT_DECAY: 0.05 21 | OPTIMIZER: "ADAMW" 22 | BACKBONE_MULTIPLIER: 0.1 23 | CLIP_GRADIENTS: 24 | ENABLED: True 25 | CLIP_TYPE: "full_model" 26 | CLIP_VALUE: 0.01 27 | NORM_TYPE: 2.0 28 | AMP: 29 | ENABLED: True 30 | INPUT: 31 | MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" 32 | RANDOM_FLIP: "flip_by_clip" 33 | AUGMENTATIONS: [] 34 | MIN_SIZE_TRAIN: (360, 480) 35 | MIN_SIZE_TEST: 360 36 | CROP: 37 | ENABLED: False 38 | TYPE: "absolute_range" 39 | SIZE: (600, 720) 40 | FORMAT: "RGB" 41 | SAMPLING_FRAME_NUM: 2 42 | SAMPLING_FRAME_RANGE: 20 43 | TEST: 44 | EVAL_PERIOD: 0 45 | DETECTIONS_PER_IMAGE: 50 46 | DATALOADER: 47 | FILTER_EMPTY_ANNOTATIONS: False 48 | NUM_WORKERS: 4 49 | VERSION: 2 50 | -------------------------------------------------------------------------------- /ovformer/modeling/util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import json 3 | import numpy as np 4 | from torch.nn import functional as F 5 | 6 | def load_class_freq( 7 | path='datasets/metadata/lvis_v1_train_cat_info.json', freq_weight=1.0): 8 | cat_info = json.load(open(path, 'r')) 9 | if path=='datasets/metadata/lvis_v1_train_cat_info.json': 10 | cat_info = torch.tensor( 11 | [(c['image_count'] if c['frequency'] != 'r' else 0) for c in sorted(cat_info, key=lambda x: x['id'])]) 12 | else: 13 | cat_info = torch.tensor( 14 | [(c['image_count'] if c['frequency'] != 'n' else 0) for c in sorted(cat_info, key=lambda x: x['id'])]) 15 | freq_weight = cat_info.float() ** freq_weight 16 | return freq_weight 17 | 18 | 19 | def get_fed_loss_inds(gt_classes, num_sample_cats, C, weight=None): 20 | appeared = torch.unique(gt_classes) # C' 21 | prob = appeared.new_ones(C + 1).float() 22 | prob[-1] = 0 23 | if len(appeared) < num_sample_cats: 24 | if weight is not None: 25 | prob[:C] = weight.float().clone() 26 | prob[appeared] = 0 27 | more_appeared = torch.multinomial( 28 | prob, num_sample_cats - len(appeared), 29 | replacement=False) 30 | appeared = torch.cat([appeared, more_appeared]) 31 | return appeared 32 | -------------------------------------------------------------------------------- /tools/convert-thirdparty-pretrained-model-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 3 | import argparse 4 | import pickle 5 | import torch 6 | 7 | """ 8 | Usage: 9 | 10 | cd models/ 11 | wget https://miil-public-eu.oss-eu-central-1.aliyuncs.com/model-zoo/ImageNet_21K_P/models/resnet50_miil_21k.pth 12 | python ../tools/convert-thirdparty-pretrained-model-to-d2.py --path resnet50_miil_21k.pth 13 | 14 | download swin_base_patch4_window12_384_22k.pth from https://github.com/microsoft/Swin-Transformer 15 | python ../tools/convert-thirdparty-pretrained-model-to-d2.py --path swin_base_patch4_window12_384_22k.pth 16 | 17 | """ 18 | 19 | 20 | if __name__ == "__main__": 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('--path', default='') 23 | args = parser.parse_args() 24 | 25 | print('Loading', args.path) 26 | model = torch.load(args.path, map_location="cpu") 27 | # import pdb; pdb.set_trace() 28 | if 'model' in model: 29 | model = model['model'] 30 | if 'state_dict' in model: 31 | model = model['state_dict'] 32 | ret = { 33 | "model": model, 34 | "__author__": "third_party", 35 | "matching_heuristics": True 36 | } 37 | out_path = args.path.replace('.pth', '.pkl') 38 | print('Saving to', out_path) 39 | pickle.dump(ret, open(out_path, "wb")) -------------------------------------------------------------------------------- /ovformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import data # register all new datasets 3 | from . import modeling 4 | 5 | # config 6 | from .config import add_ovformer_config 7 | 8 | # dataset loading 9 | from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper 10 | from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper 11 | from .data.dataset_mappers.mask_former_instance_dataset_mapper import ( 12 | MaskFormerInstanceDatasetMapper, 13 | ) 14 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import ( 15 | MaskFormerPanopticDatasetMapper, 16 | ) 17 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 18 | MaskFormerSemanticDatasetMapper, 19 | ) 20 | 21 | # models 22 | from .ovformer_model_video import OVFormerVideo 23 | from .ovformer_model import OVFormer 24 | from .test_time_augmentation import SemanticSegmentorWithTTA 25 | 26 | # evaluation 27 | from .evaluation.instance_evaluation import InstanceSegEvaluator 28 | 29 | 30 | from .data_video import ( 31 | YTVISDatasetMapper, 32 | YTVISEvaluator, 33 | OVISEvaluator, 34 | LVVISEvaluator, 35 | LVVISEvaluator_video, 36 | BURSTEvaluator, 37 | build_detection_train_loader, 38 | build_detection_test_loader, 39 | get_detection_dataset_dicts, 40 | ) 41 | -------------------------------------------------------------------------------- /ovformer/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor ms_deform_attn_cuda_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step); 26 | 27 | std::vector ms_deform_attn_cuda_backward( 28 | const at::Tensor &value, 29 | const at::Tensor &spatial_shapes, 30 | const at::Tensor &level_start_index, 31 | const at::Tensor &sampling_loc, 32 | const at::Tensor &attn_weight, 33 | const at::Tensor &grad_output, 34 | const int im2col_step); 35 | 36 | -------------------------------------------------------------------------------- /ovformer/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor 20 | ms_deform_attn_cpu_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step); 27 | 28 | std::vector 29 | ms_deform_attn_cpu_backward( 30 | const at::Tensor &value, 31 | const at::Tensor &spatial_shapes, 32 | const at::Tensor &level_start_index, 33 | const at::Tensor &sampling_loc, 34 | const at::Tensor &attn_weight, 35 | const at::Tensor &grad_output, 36 | const int im2col_step); 37 | 38 | 39 | -------------------------------------------------------------------------------- /tools/ytvis_json.py: -------------------------------------------------------------------------------- 1 | # Split results.json into base categories and novel categories, and zip compress them 2 | # YTVIS19:AP*40 = APb*33 + APn*7 3 | # YTVIS21:AP*40 = APb*34 + APn*6 4 | import json 5 | import zipfile 6 | import os 7 | 8 | # novel_id_ytvis19 = [6, 7, 9, 11, 23, 24, 39] 9 | # novel_id_ytvis21 = [11, 14, 15, 20, 30, 39] 10 | novel_id = [6, 7, 9, 11, 23, 24, 39] 11 | novel_list = [] 12 | base_list = [] 13 | 14 | results = json.load(open('output/inference/ytvis_2019_val/results.json', 'r')) 15 | for result in results: 16 | if result['category_id'] in novel_id: 17 | novel_list.append(result) 18 | else: 19 | base_list.append(result) 20 | 21 | # all.zip 22 | file_name = "results.json" 23 | with open(file_name, 'w') as json_file: 24 | json.dump(results, json_file) 25 | zip_file_name = "all.zip" 26 | with zipfile.ZipFile(zip_file_name, "w", zipfile.ZIP_DEFLATED) as zipf: 27 | zipf.write(file_name) 28 | 29 | # novel.zip 30 | with open(file_name, 'w') as json_file: 31 | json.dump(novel_list, json_file) 32 | zip_file_name = "novel.zip" 33 | with zipfile.ZipFile(zip_file_name, "w", zipfile.ZIP_DEFLATED) as zipf: 34 | zipf.write(file_name) 35 | 36 | # base.zip 37 | with open(file_name, 'w') as json_file: 38 | json.dump(base_list, json_file) 39 | zip_file_name = "base.zip" 40 | with zipfile.ZipFile(zip_file_name, "w", zipfile.ZIP_DEFLATED) as zipf: 41 | zipf.write(file_name) 42 | 43 | # delete results.json 44 | if os.path.exists(file_name): 45 | os.remove(file_name) -------------------------------------------------------------------------------- /configs/lvis/ovformer_R50_bs8.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "OVFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 1203 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["layer2", "layer3", "layer4", "layer5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["layer3", "layer4", "layer5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | CLIP_TEXT_PATH: "datasets/metadata/fg_bg_5_10_lvis_ens.npy" 22 | CLIP_CLASSIFIER: True 23 | DEEP_SUPERVISION: True 24 | NO_OBJECT_WEIGHT: 0.1 25 | OBJECT_WEIGHT: 2.0 26 | CLASS_WEIGHT: 2.0 27 | MASK_WEIGHT: 5.0 28 | DICE_WEIGHT: 5.0 29 | HIDDEN_DIM: 256 30 | NUM_OBJECT_QUERIES: 100 31 | NHEADS: 8 32 | 
DROPOUT: 0.0 33 | DIM_FEEDFORWARD: 2048 34 | ENC_LAYERS: 0 35 | PRE_NORM: False 36 | ENFORCE_INPUT_PROJ: False 37 | SIZE_DIVISIBILITY: 32 38 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 39 | TRAIN_NUM_POINTS: 12544 40 | OVERSAMPLE_RATIO: 3.0 41 | IMPORTANCE_SAMPLE_RATIO: 0.75 42 | TEST: 43 | SEMANTIC_ON: False 44 | INSTANCE_ON: True 45 | PANOPTIC_ON: False 46 | OVERLAP_THRESHOLD: 0.8 47 | OBJECT_MASK_THRESHOLD: 0.8 48 | -------------------------------------------------------------------------------- /ovformer/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | 22 | at::Tensor 23 | ms_deform_attn_cpu_forward( 24 | const at::Tensor &value, 25 | const at::Tensor &spatial_shapes, 26 | const at::Tensor &level_start_index, 27 | const at::Tensor &sampling_loc, 28 | const at::Tensor &attn_weight, 29 | const int im2col_step) 30 | { 31 | AT_ERROR("Not implement on cpu"); 32 | } 33 | 34 | std::vector 35 | ms_deform_attn_cpu_backward( 36 | const at::Tensor &value, 37 | const at::Tensor &spatial_shapes, 38 | const at::Tensor &level_start_index, 39 | const at::Tensor &sampling_loc, 40 | const at::Tensor &attn_weight, 41 | const at::Tensor &grad_output, 42 | const int im2col_step) 43 | { 44 | AT_ERROR("Not implement on cpu"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /tools/get_lvvis_cat_info.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | if __name__ == '__main__': 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--ann", default='datasets/LVVIS/train/train_instances_nonovel.json') 7 | parser.add_argument("--add_freq", action='store_true') 8 | 9 | args = parser.parse_args() 10 | 11 | print('Loading', args.ann) 12 | data = json.load(open(args.ann, 'r')) 13 | cats = data['categories'] 14 | videos = data['videos'] 15 | 16 | video_count = {x['id']: set() for x in cats} 17 | image_count = {x['id']: 0 for x in cats} 18 | ann_count = {x['id']: 0 for x in cats} 19 | 20 | for x in data['annotations']: 21 | video_count[x['category_id']].add(x['video_id']) 22 | ann_count[x['category_id']] += x['length'] 23 | 24 | for category_id, video_set in video_count.items(): 25 | for video_id in video_set: 26 | image_count[category_id] += videos[video_id]['length'] 27 | 28 | num_freqs = {x: 0 for x in ['b', 'n']} 29 | for x in cats: 30 | x['image_count'] = image_count[x['id']] 31 | x['instance_count'] = ann_count[x['id']] 32 | if args.add_freq: 33 | freq = 'b' 34 | if x['image_count'] == 0: 35 | freq = 'n' 36 | x['frequency'] = freq 37 | num_freqs[freq] += 1 38 
| 39 | if args.add_freq: 40 | for x in ['b', 'n']: 41 | print(x, num_freqs[x]) 42 | out = cats # {'categories': cats} 43 | out_path = 'datasets/metadata/lvvis_train_cat_info.json' 44 | print('Saving to', out_path) 45 | json.dump(out, open(out_path, 'w')) 46 | -------------------------------------------------------------------------------- /tools/get_lvis_cat_info.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import argparse 3 | import json 4 | 5 | if __name__ == '__main__': 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--ann", default='datasets/lvis/lvis_v1_train.json') 8 | parser.add_argument("--add_freq", action='store_true') 9 | parser.add_argument("--r_thresh", type=int, default=10) 10 | parser.add_argument("--c_thresh", type=int, default=100) 11 | args = parser.parse_args() 12 | 13 | print('Loading', args.ann) 14 | data = json.load(open(args.ann, 'r')) 15 | cats = data['categories'] 16 | image_count = {x['id']: set() for x in cats} 17 | ann_count = {x['id']: 0 for x in cats} 18 | for x in data['annotations']: 19 | image_count[x['category_id']].add(x['image_id']) 20 | ann_count[x['category_id']] += 1 21 | num_freqs = {x: 0 for x in ['r', 'f', 'c']} 22 | for x in cats: 23 | x['image_count'] = len(image_count[x['id']]) 24 | x['instance_count'] = ann_count[x['id']] 25 | if args.add_freq: 26 | freq = 'f' 27 | if x['image_count'] < args.c_thresh: 28 | freq = 'c' 29 | if x['image_count'] < args.r_thresh: 30 | freq = 'r' 31 | x['frequency'] = freq 32 | num_freqs[freq] += 1 33 | print(cats) 34 | image_counts = sorted([x['image_count'] for x in cats]) 35 | # print('image count', image_counts) 36 | # import pdb; pdb.set_trace() 37 | if args.add_freq: 38 | for x in ['r', 'c', 'f']: 39 | print(x, num_freqs[x]) 40 | out = cats # {'categories': cats} 41 | out_path = 'datasets/metadata/lvis_v1_train_cat_info.json' 42 | print('Saving to', out_path) 43 | json.dump(out, open(out_path, 'w')) 44 | 45 | -------------------------------------------------------------------------------- /configs/ovis/ovformer_R50_bs8.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../lvis/Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "OVFormerVideo" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 25 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["layer2", "layer3", "layer4", "layer5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["layer3", "layer4", "layer5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | CLIP_TEXT_PATH: "datasets/metadata/fg_bg_5_10_ovis_ens.npy" 22 | CLIP_IMAGE_PATH: "datasets/metadata/ovis_val_clip_feature.pkl" 23 | CLIP_CLASSIFIER: True 24 | DEEP_SUPERVISION: True 25 | NO_OBJECT_WEIGHT: 0.1 26 | OBJECT_WEIGHT: 2.0 27 | CLASS_WEIGHT: 2.0 28 | MASK_WEIGHT: 5.0 29 | DICE_WEIGHT: 5.0 30 | HIDDEN_DIM: 256 31 | NUM_OBJECT_QUERIES: 100 32 | NHEADS: 8 33 | DROPOUT: 0.0 34 | DIM_FEEDFORWARD: 2048 35 | ENC_LAYERS: 0 36 | PRE_NORM: False 37 | ENFORCE_INPUT_PROJ: False 38 | SIZE_DIVISIBILITY: 32 39 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 40 | TRAIN_NUM_POINTS: 12544 41 | 
OVERSAMPLE_RATIO: 3.0 42 | IMPORTANCE_SAMPLE_RATIO: 0.75 43 | TEST: 44 | SEMANTIC_ON: False 45 | INSTANCE_ON: True 46 | PANOPTIC_ON: False 47 | OVERLAP_THRESHOLD: 0.8 48 | OBJECT_MASK_THRESHOLD: 0.8 49 | 50 | DATASETS: 51 | TEST: ("ovis_val",) 52 | 53 | INPUT: 54 | MIN_SIZE_TEST: 360 55 | 56 | TEST: 57 | DETECTIONS_PER_IMAGE: 20 58 | -------------------------------------------------------------------------------- /configs/burst/ovformer_R50_bs8.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../lvis/Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "OVFormerVideo" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 482 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["layer2", "layer3", "layer4", "layer5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["layer3", "layer4", "layer5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | CLIP_TEXT_PATH: "datasets/metadata/fg_bg_5_10_burst_ens.npy" 22 | CLIP_IMAGE_PATH: "datasets/metadata/burst_val_clip_feature.pkl" 23 | CLIP_CLASSIFIER: True 24 | DEEP_SUPERVISION: True 25 | NO_OBJECT_WEIGHT: 0.1 26 | OBJECT_WEIGHT: 2.0 27 | CLASS_WEIGHT: 2.0 28 | MASK_WEIGHT: 5.0 29 | DICE_WEIGHT: 5.0 30 | HIDDEN_DIM: 256 31 | NUM_OBJECT_QUERIES: 100 32 | NHEADS: 8 33 | DROPOUT: 0.0 34 | DIM_FEEDFORWARD: 2048 35 | ENC_LAYERS: 0 36 | PRE_NORM: False 37 | ENFORCE_INPUT_PROJ: False 38 | SIZE_DIVISIBILITY: 32 39 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 40 | TRAIN_NUM_POINTS: 12544 41 | OVERSAMPLE_RATIO: 3.0 42 | IMPORTANCE_SAMPLE_RATIO: 0.75 43 | TEST: 44 | SEMANTIC_ON: False 45 | INSTANCE_ON: True 46 | PANOPTIC_ON: False 47 | OVERLAP_THRESHOLD: 0.8 48 | OBJECT_MASK_THRESHOLD: 0.8 49 | 50 | DATASETS: 51 | TEST: ("burst_val",) 52 | 53 | INPUT: 54 | MIN_SIZE_TEST: 360 55 | 56 | TEST: 57 | DETECTIONS_PER_IMAGE: 50 58 | -------------------------------------------------------------------------------- /configs/lvvis/ovformer_R50_bs8.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../lvis/Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "OVFormerVideo" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 1196 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["layer2", "layer3", "layer4", "layer5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["layer3", "layer4", "layer5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | CLIP_TEXT_PATH: "datasets/metadata/fg_bg_5_10_lvvis_ens.npy" 22 | CLIP_IMAGE_PATH: "datasets/metadata/lvvis_val_clip_feature.pkl" 23 | CLIP_CLASSIFIER: True 24 | DEEP_SUPERVISION: True 25 | NO_OBJECT_WEIGHT: 0.1 26 | OBJECT_WEIGHT: 2.0 27 | CLASS_WEIGHT: 2.0 28 | MASK_WEIGHT: 5.0 29 | DICE_WEIGHT: 5.0 30 | HIDDEN_DIM: 256 31 | NUM_OBJECT_QUERIES: 100 32 | NHEADS: 8 33 | DROPOUT: 0.0 34 | DIM_FEEDFORWARD: 2048 35 | ENC_LAYERS: 0 36 | PRE_NORM: False 37 | ENFORCE_INPUT_PROJ: False 38 | 
SIZE_DIVISIBILITY: 32 39 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 40 | TRAIN_NUM_POINTS: 12544 41 | OVERSAMPLE_RATIO: 3.0 42 | IMPORTANCE_SAMPLE_RATIO: 0.75 43 | TEST: 44 | SEMANTIC_ON: False 45 | INSTANCE_ON: True 46 | PANOPTIC_ON: False 47 | OVERLAP_THRESHOLD: 0.8 48 | OBJECT_MASK_THRESHOLD: 0.8 49 | 50 | DATASETS: 51 | TEST: ("lvvis_val",) 52 | 53 | INPUT: 54 | MIN_SIZE_TEST: 360 55 | 56 | TEST: 57 | DETECTIONS_PER_IMAGE: 50 58 | -------------------------------------------------------------------------------- /configs/youtubevis_2019/ovformer_R50_bs8.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../lvis/Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "OVFormerVideo" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 40 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["layer2", "layer3", "layer4", "layer5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["layer3", "layer4", "layer5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | CLIP_TEXT_PATH: "datasets/metadata/fg_bg_5_10_ytvis19_ens.npy" 22 | CLIP_IMAGE_PATH: "datasets/metadata/ytvis_2019_val_clip_feature.pkl" 23 | CLIP_CLASSIFIER: True 24 | DEEP_SUPERVISION: True 25 | NO_OBJECT_WEIGHT: 0.1 26 | OBJECT_WEIGHT: 2.0 27 | CLASS_WEIGHT: 2.0 28 | MASK_WEIGHT: 5.0 29 | DICE_WEIGHT: 5.0 30 | HIDDEN_DIM: 256 31 | NUM_OBJECT_QUERIES: 100 32 | NHEADS: 8 33 | DROPOUT: 0.0 34 | DIM_FEEDFORWARD: 2048 35 | ENC_LAYERS: 0 36 | PRE_NORM: False 37 | ENFORCE_INPUT_PROJ: False 38 | SIZE_DIVISIBILITY: 32 39 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 40 | TRAIN_NUM_POINTS: 12544 41 | OVERSAMPLE_RATIO: 3.0 42 | IMPORTANCE_SAMPLE_RATIO: 0.75 43 | TEST: 44 | SEMANTIC_ON: False 45 | INSTANCE_ON: True 46 | PANOPTIC_ON: False 47 | OVERLAP_THRESHOLD: 0.8 48 | OBJECT_MASK_THRESHOLD: 0.8 49 | 50 | DATASETS: 51 | TEST: ("ytvis_2019_val",) 52 | 53 | INPUT: 54 | MIN_SIZE_TEST: 360 55 | 56 | TEST: 57 | DETECTIONS_PER_IMAGE: 10 58 | -------------------------------------------------------------------------------- /configs/youtubevis_2021/ovformer_R50_bs8.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../lvis/Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "OVFormerVideo" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 40 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["layer2", "layer3", "layer4", "layer5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["layer3", "layer4", "layer5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | CLIP_TEXT_PATH: "datasets/metadata/fg_bg_5_10_ytvis21_ens.npy" 22 | CLIP_IMAGE_PATH: "datasets/metadata/ytvis_2021_val_clip_feature.pkl" 23 | CLIP_CLASSIFIER: True 24 | DEEP_SUPERVISION: True 25 | NO_OBJECT_WEIGHT: 0.1 26 | OBJECT_WEIGHT: 2.0 27 | CLASS_WEIGHT: 2.0 28 | MASK_WEIGHT: 5.0 29 | DICE_WEIGHT: 5.0 30 | 
HIDDEN_DIM: 256 31 | NUM_OBJECT_QUERIES: 100 32 | NHEADS: 8 33 | DROPOUT: 0.0 34 | DIM_FEEDFORWARD: 2048 35 | ENC_LAYERS: 0 36 | PRE_NORM: False 37 | ENFORCE_INPUT_PROJ: False 38 | SIZE_DIVISIBILITY: 32 39 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 40 | TRAIN_NUM_POINTS: 12544 41 | OVERSAMPLE_RATIO: 3.0 42 | IMPORTANCE_SAMPLE_RATIO: 0.75 43 | TEST: 44 | SEMANTIC_ON: False 45 | INSTANCE_ON: True 46 | PANOPTIC_ON: False 47 | OVERLAP_THRESHOLD: 0.8 48 | OBJECT_MASK_THRESHOLD: 0.8 49 | 50 | DATASETS: 51 | TEST: ("ytvis_2021_val",) 52 | 53 | INPUT: 54 | MIN_SIZE_TEST: 360 55 | 56 | TEST: 57 | DETECTIONS_PER_IMAGE: 10 58 | -------------------------------------------------------------------------------- /configs/lvvis/video_ovformer_R50_bs8.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-LVVIS-VideoInstanceSegmentation.yaml 2 | MODEL: 3 | WEIGHTS: "models/ovformer_r50_lvis.pth" 4 | META_ARCHITECTURE: "VideoOVFormer" 5 | SEM_SEG_HEAD: 6 | NAME: "MaskFormerHead" 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 1196 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["layer2", "layer3", "layer4", "layer5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["layer3", "layer4", "layer5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | CLIP_TEXT_PATH: "datasets/metadata/fg_bg_5_10_lvvis_ens.npy" 23 | CLIP_IMAGE_PATH: "datasets/metadata/lvvis_val_clip_feature.pkl" 24 | #CLIP_IMAGE_PATH: "datasets/metadata/lvvis_test_clip_feature.pkl" 25 | CLIP_CLASSIFIER: True 26 | DEEP_SUPERVISION: True 27 | NO_OBJECT_WEIGHT: 0.1 28 | OBJECT_WEIGHT: 2.0 29 | CLASS_WEIGHT: 2.0 30 | MASK_WEIGHT: 5.0 31 | DICE_WEIGHT: 5.0 32 | HIDDEN_DIM: 256 33 | NUM_OBJECT_QUERIES: 100 34 | NHEADS: 8 35 | DROPOUT: 0.0 36 | DIM_FEEDFORWARD: 2048 37 | ENC_LAYERS: 0 38 | PRE_NORM: False 39 | ENFORCE_INPUT_PROJ: False 40 | SIZE_DIVISIBILITY: 32 41 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 42 | TRAIN_NUM_POINTS: 12544 43 | OVERSAMPLE_RATIO: 3.0 44 | IMPORTANCE_SAMPLE_RATIO: 0.75 45 | TEST: 46 | SEMANTIC_ON: False 47 | INSTANCE_ON: True 48 | PANOPTIC_ON: False 49 | OVERLAP_THRESHOLD: 0.8 50 | OBJECT_MASK_THRESHOLD: 0.8 51 | 52 | DATASETS: 53 | TEST: ("lvvis_val",) 54 | # TEST: ("lvvis_test",) -------------------------------------------------------------------------------- /ovformer/modeling/pixel_decoder/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | 18 | #include "cpu/ms_deform_attn_cpu.h" 19 | 20 | #ifdef WITH_CUDA 21 | #include "cuda/ms_deform_attn_cuda.h" 22 | #endif 23 | 24 | 25 | at::Tensor 26 | ms_deform_attn_forward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const int im2col_step) 33 | { 34 | if (value.type().is_cuda()) 35 | { 36 | #ifdef WITH_CUDA 37 | return ms_deform_attn_cuda_forward( 38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector 47 | ms_deform_attn_backward( 48 | const at::Tensor &value, 49 | const at::Tensor &spatial_shapes, 50 | const at::Tensor &level_start_index, 51 | const at::Tensor &sampling_loc, 52 | const at::Tensor &attn_weight, 53 | const at::Tensor &grad_output, 54 | const int im2col_step) 55 | { 56 | if (value.type().is_cuda()) 57 | { 58 | #ifdef WITH_CUDA 59 | return ms_deform_attn_cuda_backward( 60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 61 | #else 62 | AT_ERROR("Not compiled with GPU support"); 63 | #endif 64 | } 65 | AT_ERROR("Not implemented on the CPU"); 66 | } 67 | 68 | -------------------------------------------------------------------------------- /mAP.py: -------------------------------------------------------------------------------- 1 | from evaluate.lvvis import LVVIS 2 | from evaluate.burst import BURST 3 | from evaluate.bursteval import BURSTeval 4 | from evaluate.lvviseval import LVVISeval 5 | 6 | import sys 7 | import numpy as np 8 | import os 9 | from pycocotools.coco import COCO 10 | from pycocotools.cocoeval import COCOeval 11 | import json 12 | import itertools 13 | import torch 14 | from detectron2.utils.file_io import PathManager 15 | import argparse 16 | import json 17 | from datetime import datetime 18 | 19 | import os 20 | import sys 21 | import logging 22 | 23 | 24 | def pth_to_json(pth_path): 25 | predictions=torch.load(pth_path) 26 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) 27 | for result in coco_results: 28 | category_id = result["category_id"] 29 | result["category_id"] = category_id+1 30 | file_path = os.path.join(os.path.dirname(pth_path), "instances_results.json") 31 | with PathManager.open(file_path, "w") as f: 32 | f.write(json.dumps(coco_results)) 33 | f.flush() 34 | 35 | if __name__ == '__main__': 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument('--dt', default='output') 38 | parser.add_argument('--et', default='("lvvis_val",)') # ("burst_val",) 39 | 40 | args = parser.parse_args() 41 | dt_path=os.path.join(args.dt,'inference/results.json') 42 | output_file = os.path.join(os.path.dirname(args.dt), "results.txt") 43 | logging.basicConfig(filename=output_file, level=logging.INFO, filemode='a', format='%(asctime)s - %(levelname)s - %(message)s') 44 | 45 | 46 | console = logging.StreamHandler(sys.stdout) 47 | console.setLevel(logging.INFO) 48 | formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') 49 | console.setFormatter(formatter) 50 | logging.getLogger('').addHandler(console) 51 | logger = logging.getLogger(__name__) 52 | eval_type=args.et 53 | 54 | if 'lvvis' in eval_type: 55 | 
DATAEVAL=LVVIS 56 | DATAEVALeval=LVVISeval 57 | gt_path='datasets/LVVIS/val/val_instances_.json' 58 | 59 | elif 'burst' in eval_type: 60 | DATAEVAL=BURST 61 | DATAEVALeval=BURSTeval 62 | gt_path='datasets/burst/b2y_val.json' 63 | else: 64 | logger.info("\n") 65 | logger.info(f"\nAnnotations is invalid\n") 66 | raise NotImplementedError 67 | 68 | current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 69 | logger.info("\n") 70 | logger.info(f"\n===== {current_time} =====\n") 71 | ytvosGT = DATAEVAL(gt_path) 72 | ytvosDT = ytvosGT.loadRes(dt_path) 73 | ytvosEval = DATAEVALeval(ytvosGT, ytvosDT, "segm") 74 | ytvosEval.evaluate() 75 | ytvosEval.accumulate() 76 | ytvosEval.summarize() -------------------------------------------------------------------------------- /ovformer/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | if mask is None: 31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 32 | not_mask = ~mask 33 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 34 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 35 | if self.normalize: 36 | eps = 1e-6 37 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 38 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 39 | 40 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 41 | dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode='floor') / self.num_pos_feats) 42 | 43 | pos_x = x_embed[:, :, :, None] / dim_t 44 | pos_y = y_embed[:, :, :, None] / dim_t 45 | pos_x = torch.stack( 46 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 47 | ).flatten(3) 48 | pos_y = torch.stack( 49 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 50 | ).flatten(3) 51 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 52 | return pos 53 | 54 | def __repr__(self, _repr_indent=4): 55 | head = "Positional encoding " + self.__class__.__name__ 56 | body = [ 57 | "num_pos_feats: {}".format(self.num_pos_feats), 58 | "temperature: {}".format(self.temperature), 59 | "normalize: {}".format(self.normalize), 60 | "scale: {}".format(self.scale), 61 | ] 62 | # _repr_indent = 4 63 | lines = [head] + [" " * _repr_indent + line for line in body] 64 | return "\n".join(lines) 65 | -------------------------------------------------------------------------------- /ovformer_video/modeling/transformer_decoder/position_encoding.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine3D(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | # b, t, c, h, w 31 | assert x.dim() == 5, f"{x.shape} should be a 5-dimensional Tensor, got {x.dim()}-dimensional Tensor instead" 32 | if mask is None: 33 | mask = torch.zeros((x.size(0), x.size(1), x.size(3), x.size(4)), device=x.device, dtype=torch.bool) 34 | not_mask = ~mask 35 | z_embed = not_mask.cumsum(1, dtype=torch.float32) 36 | y_embed = not_mask.cumsum(2, dtype=torch.float32) 37 | x_embed = not_mask.cumsum(3, dtype=torch.float32) 38 | if self.normalize: 39 | eps = 1e-6 40 | z_embed = z_embed / (z_embed[:, -1:, :, :] + eps) * self.scale 41 | y_embed = y_embed / (y_embed[:, :, -1:, :] + eps) * self.scale 42 | x_embed = x_embed / (x_embed[:, :, :, -1:] + eps) * self.scale 43 | 44 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 45 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 46 | 47 | dim_t_z = torch.arange((self.num_pos_feats * 2), dtype=torch.float32, device=x.device) 48 | dim_t_z = self.temperature ** (2 * (dim_t_z // 2) / (self.num_pos_feats * 2)) 49 | 50 | pos_x = x_embed[:, :, :, :, None] / dim_t 51 | pos_y = y_embed[:, :, :, :, None] / dim_t 52 | pos_z = z_embed[:, :, :, :, None] / dim_t_z 53 | pos_x = torch.stack((pos_x[:, :, :, :, 0::2].sin(), pos_x[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 54 | pos_y = torch.stack((pos_y[:, :, :, :, 0::2].sin(), pos_y[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 55 | pos_z = torch.stack((pos_z[:, :, :, :, 0::2].sin(), pos_z[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 56 | pos = (torch.cat((pos_y, pos_x), dim=4) + pos_z).permute(0, 1, 4, 2, 3) # b, t, c, h, w 57 | return pos 58 | -------------------------------------------------------------------------------- /ovformer_video/utils/memory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | import logging 4 | from contextlib import contextmanager 5 | from functools import wraps 6 | import torch 7 | from torch.cuda.amp import autocast 8 | 9 | __all__ = ["retry_if_cuda_oom"] 10 | 11 | 12 | @contextmanager 13 | def _ignore_torch_cuda_oom(): 14 | """ 15 | A context which ignores CUDA OOM exception from pytorch. 16 | """ 17 | try: 18 | yield 19 | except RuntimeError as e: 20 | # NOTE: the string may change? 21 | if "CUDA out of memory. 
" in str(e): 22 | pass 23 | else: 24 | raise 25 | 26 | 27 | def retry_if_cuda_oom(func): 28 | """ 29 | Makes a function retry itself after encountering 30 | pytorch's CUDA OOM error. 31 | It will first retry after calling `torch.cuda.empty_cache()`. 32 | If that still fails, it will then retry by trying to convert inputs to CPUs. 33 | In this case, it expects the function to dispatch to CPU implementation. 34 | The return values may become CPU tensors as well and it's user's 35 | responsibility to convert it back to CUDA tensor if needed. 36 | Args: 37 | func: a stateless callable that takes tensor-like objects as arguments 38 | Returns: 39 | a callable which retries `func` if OOM is encountered. 40 | Examples: 41 | :: 42 | output = retry_if_cuda_oom(some_torch_function)(input1, input2) 43 | # output may be on CPU even if inputs are on GPU 44 | Note: 45 | 1. When converting inputs to CPU, it will only look at each argument and check 46 | if it has `.device` and `.to` for conversion. Nested structures of tensors 47 | are not supported. 48 | 2. Since the function might be called more than once, it has to be 49 | stateless. 50 | """ 51 | 52 | def maybe_to_cpu(x): 53 | try: 54 | like_gpu_tensor = x.device.type == "cuda" and hasattr(x, "to") 55 | except AttributeError: 56 | like_gpu_tensor = False 57 | if like_gpu_tensor: 58 | return x.to(device="cpu").to(torch.float32) 59 | else: 60 | return x 61 | 62 | @wraps(func) 63 | def wrapped(*args, **kwargs): 64 | with _ignore_torch_cuda_oom(): 65 | return func(*args, **kwargs) 66 | 67 | # Clear cache and retry 68 | torch.cuda.empty_cache() 69 | with _ignore_torch_cuda_oom(): 70 | return func(*args, **kwargs) 71 | 72 | # Try on CPU. This slows down the code significantly, therefore print a notice. 73 | logger = logging.getLogger(__name__) 74 | logger.info("Attempting to copy inputs to CPU due to CUDA OOM") 75 | new_args = (maybe_to_cpu(x) for x in args) 76 | new_kwargs = {k: maybe_to_cpu(v) for k, v in kwargs.items()} 77 | with autocast(enabled=False): 78 | return func(*new_args, **new_kwargs) 79 | 80 | return wrapped 81 | -------------------------------------------------------------------------------- /ovformer/modeling/pixel_decoder/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | import os 13 | import glob 14 | 15 | import torch 16 | 17 | from torch.utils.cpp_extension import CUDA_HOME 18 | from torch.utils.cpp_extension import CppExtension 19 | from torch.utils.cpp_extension import CUDAExtension 20 | 21 | from setuptools import find_packages 22 | from setuptools import setup 23 | 24 | requirements = ["torch", "torchvision"] 25 | 26 | def get_extensions(): 27 | this_dir = os.path.dirname(os.path.abspath(__file__)) 28 | extensions_dir = os.path.join(this_dir, "src") 29 | 30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 33 | 34 | sources = main_file + source_cpu 35 | extension = CppExtension 36 | extra_compile_args = {"cxx": []} 37 | define_macros = [] 38 | 39 | # Force cuda since torch ask for a device, not if cuda is in fact available. 40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: 41 | extension = CUDAExtension 42 | sources += source_cuda 43 | define_macros += [("WITH_CUDA", None)] 44 | extra_compile_args["nvcc"] = [ 45 | "-DCUDA_HAS_FP16=1", 46 | "-D__CUDA_NO_HALF_OPERATORS__", 47 | "-D__CUDA_NO_HALF_CONVERSIONS__", 48 | "-D__CUDA_NO_HALF2_OPERATORS__", 49 | ] 50 | else: 51 | if CUDA_HOME is None: 52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') 53 | else: 54 | raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') 55 | 56 | sources = [os.path.join(extensions_dir, s) for s in sources] 57 | include_dirs = [extensions_dir] 58 | ext_modules = [ 59 | extension( 60 | "MultiScaleDeformableAttention", 61 | sources, 62 | include_dirs=include_dirs, 63 | define_macros=define_macros, 64 | extra_compile_args=extra_compile_args, 65 | ) 66 | ] 67 | return ext_modules 68 | 69 | setup( 70 | name="MultiScaleDeformableAttention", 71 | version="1.0", 72 | author="Weijie Su", 73 | url="https://github.com/fundamentalvision/Deformable-DETR", 74 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 75 | packages=find_packages(exclude=("configs", "tests",)), 76 | ext_modules=get_extensions(), 77 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 78 | ) 79 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Unified Embedding Alignment for Open-Vocabulary Video Instance Segmentation (ECCV 2024) 2 | 3 | [Hao Fang](https://fanghaook.github.io/), 4 | Peng Wu, 5 | [Yawei Li](https://scholar.google.com.hk/citations?user=IFLsTGsAAAAJ), 6 | [Xinxin Zhang](https://scholar.google.cz/citations?user=rPv44PoAAAAJ), 7 | [Xiankai Lu](https://scholar.google.com.hk/citations?user=QS5V5b8AAAAJ) 8 | 9 | [[`paper`](https://arxiv.org/pdf/2407.07427)] [[`BibTeX`](#CitingOVFormer)] 10 | 11 |
12 | ![OVFormer overview](OVFormer.png)
13 | 
14 | 15 | ## Installation 16 | 17 | See [installation instructions](INSTALL.md). 18 | 19 | ## Data Preparation 20 | See [Preparing Datasets for OVFormer](./datasets/README.md). 21 | 22 | ## Getting Started 23 | We first train the OVFormer model on the LVIS dataset: 24 | ```bash 25 | python train_net.py --num-gpus 4 \ 26 | --config-file configs/lvis/ovformer_R50_bs8.yaml 27 | ``` 28 | To evaluate the model's zero-shot generalization performance on the VIS datasets, use 29 | ```bash 30 | python train_net_video.py \ 31 | --config-file configs/youtubevis_2019/ovformer_R50_bs8.yaml \ 32 | --eval-only MODEL.WEIGHTS models/ovformer_r50_lvis.pth 33 | ``` 34 | For YTVIS19/21, split results.json into base and novel categories with this [tool](./tools/ytvis_json.py); 35 | for OVIS, package the results and upload them to the evaluation server; for BURST, run ```mAP.py```. 36 | You are expected to get results like this: 37 | 38 | | Model | Backbone | YTVIS19 | YTVIS21 | OVIS | BURST | weights | 39 | |:--------:|:--------:|:-------:|:-------:|:----:|:-----:|:---------:| 40 | | OVFormer | R-50 | 34.8 | 29.8 | 15.1 | 6.8 | [model](https://drive.google.com/file/d/1-tMcjp8xIYr9E5r5JYOESGajXtMAs33y/view?usp=sharing) | 41 | | OVFormer | Swin-B | 44.3 | 37.6 | 21.3 | 7.6 | [model](https://drive.google.com/file/d/102qxZlu05yXILfghhrwjxv-tL3MlcYu7/view?usp=sharing) | 42 | 43 | Then, we perform video-based training of OVFormer on the LV-VIS dataset: 44 | ```bash 45 | python train_net_lvvis.py --num-gpus 4 \ 46 | --config-file configs/lvvis/video_ovformer_R50_bs8.yaml 47 | ``` 48 | To evaluate the model's performance on the LV-VIS dataset, use 49 | ```bash 50 | python train_net_lvvis.py \ 51 | --config-file configs/lvvis/video_ovformer_R50_bs8.yaml \ 52 | --eval-only MODEL.WEIGHTS models/ovformer_r50_lvvis.pth 53 | ``` 54 | Run ```mAP.py``` (see the sketch at the end of this README); you are expected to get results like this: 55 | 56 | | Model | Backbone | LVVIS val | LVVIS test | weights | 57 | |:--------------:|:--------:|:---------:|:----------:|:---------:| 58 | | OVFormer | R-50 | 21.9 | 15.2 | [model](https://drive.google.com/file/d/1-zfEwdglPeVHzlc5Ky_HJlZtMgGXAy1S/view?usp=sharing) | 59 | | OVFormer | Swin-B | 24.7 | 19.5 | [model](https://drive.google.com/file/d/107BNsu9eTr5e70B4oj28jgHKBjYTWNWp/view?usp=sharing) | 60 | 61 | ## Citing OVFormer 62 | ```BibTeX 63 | @inproceedings{fang2024unified, 64 | title={Unified embedding alignment for open-vocabulary video instance segmentation}, 65 | author={Fang, Hao and Wu, Peng and Li, Yawei and Zhang, Xinxin and Lu, Xiankai}, 66 | booktitle={ECCV}, 67 | pages={225--241}, 68 | year={2025}, 69 | organization={Springer} 70 | } 71 | ``` 72 | 73 | ## Acknowledgement 74 | 75 | This repo is based on [detectron2](https://github.com/facebookresearch/detectron2), 76 | [Mask2Former](https://github.com/facebookresearch/Mask2Former), 77 | and [LVVIS](https://github.com/haochenheheda/LVVIS). Thanks for their great work! 
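## Programmatic Evaluation (sketch)

```mAP.py``` can also be driven from Python. The snippet below is a minimal sketch that mirrors its logic for the LV-VIS validation split; it assumes predictions have already been written to `output/inference/results.json` and that the annotations sit at the default path used by this repo.

```python
# Minimal sketch mirroring mAP.py for the LV-VIS val split (paths are the repo defaults).
from evaluate.lvvis import LVVIS
from evaluate.lvviseval import LVVISeval

gt = LVVIS('datasets/LVVIS/val/val_instances_.json')  # ground-truth annotations
dt = gt.loadRes('output/inference/results.json')      # predicted segmentations
ev = LVVISeval(gt, dt, "segm")                        # mask (segm) evaluation
ev.evaluate()
ev.accumulate()
ev.summarize()                                        # prints the AP/AR summary
```

For BURST, swap in `BURST`/`BURSTeval` and `datasets/burst/b2y_val.json`, exactly as ```mAP.py``` does.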
78 | -------------------------------------------------------------------------------- /tools/vis_results.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import json 4 | import cv2 5 | import tqdm 6 | from pycocotools import mask as pymask 7 | import numpy as np 8 | import tqdm 9 | 10 | def get_center(mask): 11 | # Get the central part of the object 12 | h1,h2 = np.argwhere(mask.sum(axis=1).reshape(-1)).min(), np.argwhere(mask.sum(axis=1).reshape(-1)).max() 13 | w1,w2 = np.argwhere(mask.sum(axis=0).reshape(-1)).min(), np.argwhere(mask.sum(axis=0).reshape(-1)).max() 14 | return int((h1+h2)/2), int((w1+w2)/2), h1, w1, h2, w2 15 | 16 | color_map = [[20,255,20], [20, 20, 255], [255, 20, 20], [20, 255, 255], [255,20,255], [255,255,20],[42,42,128],[165, 42, 42], [134, 134, 103], [0, 0, 142], [255, 109, 65], \ 17 | [0, 226, 252], [5, 121, 0], [0, 60, 100], [250, 170, 30], [100, 170, 30], [179, 0, 194], [255, 77, 255], [120, 166, 157], \ 18 | [73, 77, 174], [0, 80, 100], [182, 182, 255], [0, 143, 149], [174, 57, 255], [0, 0, 230], [72, 0, 118], [255, 179, 240], \ 19 | [0, 125, 92], [209, 0, 151], [188, 208, 182], [145, 148, 174], [106, 0, 228], [0, 0, 70], [199, 100, 0], [166, 196, 102], \ 20 | [110, 76, 0], [133, 129, 255], [0, 0, 192], [183, 130, 88], [130, 114, 135], [107, 142, 35], [0, 228, 0], [174, 255, 243], [255, 208, 186]] 21 | 22 | 23 | output_dir = 'output/lvvis_vis' 24 | anno_json = 'datasets/LVVIS/val/val_instances_.json' 25 | dt_json = 'output/ov2seg/inference/lvvis_val/results.json' 26 | img_dir = 'datasets/LVVIS/val/JPEGImages' 27 | 28 | 29 | dt = json.load(open(dt_json, 'r')) 30 | data = json.load(open(anno_json, 'r')) 31 | categories = data['categories'] 32 | videos = data['videos'] 33 | 34 | dt_dic = {} 35 | category_dic = {} 36 | for category in categories: 37 | category_dic[category['id']] = category['name'] 38 | 39 | for d in dt: 40 | if d['video_id'] not in dt_dic.keys(): 41 | dt_dic[d['video_id']] = [] 42 | dt_dic[d['video_id']].append(d) 43 | for video in tqdm.tqdm(videos): 44 | video_name = video['file_names'][0].split('/')[0] 45 | img_list = video['file_names'] 46 | img_list.sort() 47 | video_id = video['id'] 48 | video_dt = dt_dic[video_id] 49 | for fid, img_path in enumerate(img_list): 50 | img = cv2.imread(os.path.join(img_dir, img_path)) 51 | h,w,_ = img.shape 52 | mask_vis = np.zeros((h,w,3)) 53 | for obj_id, obj in enumerate(video_dt): 54 | category_id = obj['category_id'] 55 | category_name = category_dic[category_id] 56 | score = obj['score'] 57 | if score < 0.5: 58 | continue 59 | obj_mask = pymask.decode(obj['segmentations'][fid]) 60 | if obj_mask.sum() == 0: 61 | continue 62 | color = color_map[int(obj_id)%len(color_map)] 63 | mask_vis[obj_mask > 0] = color 64 | img[obj_mask > 0] = img[obj_mask > 0] * 0.45 + mask_vis[obj_mask>0]*0.55 65 | contours,hierarchy = cv2.findContours(obj_mask,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) 66 | img = cv2.drawContours(img,contours,-1,(222,222,222),2) 67 | h_,w_,y1,x1,y2,x2 = get_center(obj_mask) 68 | img = cv2.putText(img, category_name, ((x1 + x2)//2 - 45, (y1+y2)//2 -25), cv2.FONT_HERSHEY_SIMPLEX, 1.3, (255,255,255), 5) 69 | img = cv2.putText(img, category_name, ((x1 + x2)//2 - 45, (y1+y2)//2 -25), cv2.FONT_HERSHEY_SIMPLEX, 1.3, color, 2) 70 | 71 | img_name = img_path.split('/')[-1] 72 | os.makedirs(os.path.join(output_dir, video_name),exist_ok=True) 73 | output_path = os.path.join(output_dir, video_name, img_name) 74 | cv2.imwrite(output_path, img) 75 | 76 
| -------------------------------------------------------------------------------- /ovformer/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | try: 22 | import MultiScaleDeformableAttention as MSDA 23 | except ModuleNotFoundError as e: 24 | info_string = ( 25 | "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" 26 | "\t`cd mask2former/modeling/pixel_decoder/ops`\n" 27 | "\t`sh make.sh`\n" 28 | ) 29 | raise ModuleNotFoundError(info_string) 30 | 31 | 32 | class MSDeformAttnFunction(Function): 33 | @staticmethod 34 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 35 | ctx.im2col_step = im2col_step 36 | output = MSDA.ms_deform_attn_forward( 37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 38 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 39 | return output 40 | 41 | @staticmethod 42 | @once_differentiable 43 | def backward(ctx, grad_output): 44 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 45 | grad_value, grad_sampling_loc, grad_attn_weight = \ 46 | MSDA.ms_deform_attn_backward( 47 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 48 | 49 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 50 | 51 | 52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 53 | # for debug and test only, 54 | # need to use cuda version instead 55 | N_, S_, M_, D_ = value.shape 56 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 57 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 58 | sampling_grids = 2 * sampling_locations - 1 59 | sampling_value_list = [] 60 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 61 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 62 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 63 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 64 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 65 | # N_*M_, D_, Lq_, P_ 66 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 67 | 
mode='bilinear', padding_mode='zeros', align_corners=False) 68 | sampling_value_list.append(sampling_value_l_) 69 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 70 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 71 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 72 | return output.transpose(1, 2).contiguous() 73 | -------------------------------------------------------------------------------- /ovformer/test_time_augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | import logging 4 | from itertools import count 5 | 6 | import numpy as np 7 | import torch 8 | from fvcore.transforms import HFlipTransform 9 | from torch import nn 10 | from torch.nn.parallel import DistributedDataParallel 11 | 12 | from detectron2.data.detection_utils import read_image 13 | from detectron2.modeling import DatasetMapperTTA 14 | 15 | 16 | __all__ = [ 17 | "SemanticSegmentorWithTTA", 18 | ] 19 | 20 | 21 | class SemanticSegmentorWithTTA(nn.Module): 22 | """ 23 | A SemanticSegmentor with test-time augmentation enabled. 24 | Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`. 25 | """ 26 | 27 | def __init__(self, cfg, model, tta_mapper=None, batch_size=1): 28 | """ 29 | Args: 30 | cfg (CfgNode): 31 | model (SemanticSegmentor): a SemanticSegmentor to apply TTA on. 32 | tta_mapper (callable): takes a dataset dict and returns a list of 33 | augmented versions of the dataset dict. Defaults to 34 | `DatasetMapperTTA(cfg)`. 35 | batch_size (int): batch the augmented images into this batch size for inference. 
36 | """ 37 | super().__init__() 38 | if isinstance(model, DistributedDataParallel): 39 | model = model.module 40 | self.cfg = cfg.clone() 41 | 42 | self.model = model 43 | 44 | if tta_mapper is None: 45 | tta_mapper = DatasetMapperTTA(cfg) 46 | self.tta_mapper = tta_mapper 47 | self.batch_size = batch_size 48 | 49 | def __call__(self, batched_inputs): 50 | """ 51 | Same input/output format as :meth:`SemanticSegmentor.forward` 52 | """ 53 | 54 | def _maybe_read_image(dataset_dict): 55 | ret = copy.copy(dataset_dict) 56 | if "image" not in ret: 57 | image = read_image(ret.pop("file_name"), self.model.input_format) 58 | image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW 59 | ret["image"] = image 60 | if "height" not in ret and "width" not in ret: 61 | ret["height"] = image.shape[1] 62 | ret["width"] = image.shape[2] 63 | return ret 64 | 65 | processed_results = [] 66 | for x in batched_inputs: 67 | result = self._inference_one_image(_maybe_read_image(x)) 68 | processed_results.append(result) 69 | return processed_results 70 | 71 | def _inference_one_image(self, input): 72 | """ 73 | Args: 74 | input (dict): one dataset dict with "image" field being a CHW tensor 75 | Returns: 76 | dict: one output dict 77 | """ 78 | orig_shape = (input["height"], input["width"]) 79 | augmented_inputs, tfms = self._get_augmented_inputs(input) 80 | 81 | final_predictions = None 82 | count_predictions = 0 83 | for input, tfm in zip(augmented_inputs, tfms): 84 | count_predictions += 1 85 | with torch.no_grad(): 86 | if final_predictions is None: 87 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 88 | final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2]) 89 | else: 90 | final_predictions = self.model([input])[0].pop("sem_seg") 91 | else: 92 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 93 | final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2]) 94 | else: 95 | final_predictions += self.model([input])[0].pop("sem_seg") 96 | 97 | final_predictions = final_predictions / count_predictions 98 | return {"sem_seg": final_predictions} 99 | 100 | def _get_augmented_inputs(self, input): 101 | augmented_inputs = self.tta_mapper(input) 102 | tfms = [x.pop("transforms") for x in augmented_inputs] 103 | return augmented_inputs, tfms 104 | -------------------------------------------------------------------------------- /ovformer/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py 3 | """ 4 | Misc functions, including distributed helpers. 5 | 6 | Mostly copy-paste from torchvision references. 
7 | """ 8 | from typing import List, Optional 9 | 10 | import torch 11 | import torch.distributed as dist 12 | import torchvision 13 | from torch import Tensor 14 | 15 | 16 | def _max_by_axis(the_list): 17 | # type: (List[List[int]]) -> List[int] 18 | maxes = the_list[0] 19 | for sublist in the_list[1:]: 20 | for index, item in enumerate(sublist): 21 | maxes[index] = max(maxes[index], item) 22 | return maxes 23 | 24 | 25 | class NestedTensor(object): 26 | def __init__(self, tensors, mask: Optional[Tensor]): 27 | self.tensors = tensors 28 | self.mask = mask 29 | 30 | def to(self, device): 31 | # type: (Device) -> NestedTensor # noqa 32 | cast_tensor = self.tensors.to(device) 33 | mask = self.mask 34 | if mask is not None: 35 | assert mask is not None 36 | cast_mask = mask.to(device) 37 | else: 38 | cast_mask = None 39 | return NestedTensor(cast_tensor, cast_mask) 40 | 41 | def decompose(self): 42 | return self.tensors, self.mask 43 | 44 | def __repr__(self): 45 | return str(self.tensors) 46 | 47 | 48 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 49 | # TODO make this more general 50 | if tensor_list[0].ndim == 3: 51 | if torchvision._is_tracing(): 52 | # nested_tensor_from_tensor_list() does not export well to ONNX 53 | # call _onnx_nested_tensor_from_tensor_list() instead 54 | return _onnx_nested_tensor_from_tensor_list(tensor_list) 55 | 56 | # TODO make it support different-sized images 57 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 58 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 59 | batch_shape = [len(tensor_list)] + max_size 60 | b, c, h, w = batch_shape 61 | dtype = tensor_list[0].dtype 62 | device = tensor_list[0].device 63 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 64 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 65 | for img, pad_img, m in zip(tensor_list, tensor, mask): 66 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 67 | m[: img.shape[1], : img.shape[2]] = False 68 | else: 69 | raise ValueError("not supported") 70 | return NestedTensor(tensor, mask) 71 | 72 | 73 | # _onnx_nested_tensor_from_tensor_list() is an implementation of 74 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
75 | @torch.jit.unused 76 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: 77 | max_size = [] 78 | for i in range(tensor_list[0].dim()): 79 | max_size_i = torch.max( 80 | torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) 81 | ).to(torch.int64) 82 | max_size.append(max_size_i) 83 | max_size = tuple(max_size) 84 | 85 | # work around for 86 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 87 | # m[: img.shape[1], :img.shape[2]] = False 88 | # which is not yet supported in onnx 89 | padded_imgs = [] 90 | padded_masks = [] 91 | for img in tensor_list: 92 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 93 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 94 | padded_imgs.append(padded_img) 95 | 96 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) 97 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) 98 | padded_masks.append(padded_mask.to(torch.bool)) 99 | 100 | tensor = torch.stack(padded_imgs) 101 | mask = torch.stack(padded_masks) 102 | 103 | return NestedTensor(tensor, mask=mask) 104 | 105 | 106 | def is_dist_avail_and_initialized(): 107 | if not dist.is_available(): 108 | return False 109 | if not dist.is_initialized(): 110 | return False 111 | return True 112 | -------------------------------------------------------------------------------- /ovformer/modeling/pixel_decoder/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import time 17 | import torch 18 | import torch.nn as nn 19 | from torch.autograd import gradcheck 20 | 21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | N, M, D = 1, 2, 2 25 | Lq, L, P = 2, 2, 2 26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 28 | S = sum([(H*W).item() for H, W in shapes]) 29 | 30 | 31 | torch.manual_seed(3) 32 | 33 | 34 | @torch.no_grad() 35 | def check_forward_equal_with_pytorch_double(): 36 | value = torch.rand(N, S, M, D).cuda() * 0.01 37 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 38 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 39 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 40 | im2col_step = 2 41 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 42 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 43 | fwdok = torch.allclose(output_cuda, output_pytorch) 44 | max_abs_err = (output_cuda - output_pytorch).abs().max() 45 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 46 | 47 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 48 | 49 | 50 | @torch.no_grad() 51 | def check_forward_equal_with_pytorch_float(): 52 | value = torch.rand(N, S, M, D).cuda() * 0.01 53 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 54 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 55 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 56 | im2col_step = 2 57 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 58 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 59 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 60 | max_abs_err = (output_cuda - output_pytorch).abs().max() 61 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 62 | 63 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 64 | 65 | 66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 67 | 68 | value = torch.rand(N, S, M, channels).cuda() * 0.01 69 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 70 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 71 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 72 | im2col_step = 2 73 | func = MSDeformAttnFunction.apply 74 | 75 | value.requires_grad = grad_value 76 | sampling_locations.requires_grad = grad_sampling_loc 77 | attention_weights.requires_grad = grad_attn_weight 78 | 79 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 80 | 81 | print(f'* 
{gradok} check_gradient_numerical(D={channels})') 82 | 83 | 84 | if __name__ == '__main__': 85 | check_forward_equal_with_pytorch_double() 86 | check_forward_equal_with_pytorch_float() 87 | 88 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 89 | check_gradient_numerical(channels, True, True, True) 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /ovformer/data/datasets/register_ade20k_instance.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import json 3 | import logging 4 | import numpy as np 5 | import os 6 | from PIL import Image 7 | 8 | from detectron2.data import DatasetCatalog, MetadataCatalog 9 | from detectron2.data.datasets.coco import load_coco_json, register_coco_instances 10 | from detectron2.utils.file_io import PathManager 11 | 12 | ADE_CATEGORIES = [{'id': 7, 'name': 'bed'}, {'id': 8, 'name': 'windowpane'}, {'id': 10, 'name': 'cabinet'}, {'id': 12, 'name': 'person'}, {'id': 14, 'name': 'door'}, {'id': 15, 'name': 'table'}, {'id': 18, 'name': 'curtain'}, {'id': 19, 'name': 'chair'}, {'id': 20, 'name': 'car'}, {'id': 22, 'name': 'painting'}, {'id': 23, 'name': 'sofa'}, {'id': 24, 'name': 'shelf'}, {'id': 27, 'name': 'mirror'}, {'id': 30, 'name': 'armchair'}, {'id': 31, 'name': 'seat'}, {'id': 32, 'name': 'fence'}, {'id': 33, 'name': 'desk'}, {'id': 35, 'name': 'wardrobe'}, {'id': 36, 'name': 'lamp'}, {'id': 37, 'name': 'bathtub'}, {'id': 38, 'name': 'railing'}, {'id': 39, 'name': 'cushion'}, {'id': 41, 'name': 'box'}, {'id': 42, 'name': 'column'}, {'id': 43, 'name': 'signboard'}, {'id': 44, 'name': 'chest of drawers'}, {'id': 45, 'name': 'counter'}, {'id': 47, 'name': 'sink'}, {'id': 49, 'name': 'fireplace'}, {'id': 50, 'name': 'refrigerator'}, {'id': 53, 'name': 'stairs'}, {'id': 55, 'name': 'case'}, {'id': 56, 'name': 'pool table'}, {'id': 57, 'name': 'pillow'}, {'id': 58, 'name': 'screen door'}, {'id': 62, 'name': 'bookcase'}, {'id': 64, 'name': 'coffee table'}, {'id': 65, 'name': 'toilet'}, {'id': 66, 'name': 'flower'}, {'id': 67, 'name': 'book'}, {'id': 69, 'name': 'bench'}, {'id': 70, 'name': 'countertop'}, {'id': 71, 'name': 'stove'}, {'id': 72, 'name': 'palm'}, {'id': 73, 'name': 'kitchen island'}, {'id': 74, 'name': 'computer'}, {'id': 75, 'name': 'swivel chair'}, {'id': 76, 'name': 'boat'}, {'id': 78, 'name': 'arcade machine'}, {'id': 80, 'name': 'bus'}, {'id': 81, 'name': 'towel'}, {'id': 82, 'name': 'light'}, {'id': 83, 'name': 'truck'}, {'id': 85, 'name': 'chandelier'}, {'id': 86, 'name': 'awning'}, {'id': 87, 'name': 'streetlight'}, {'id': 88, 'name': 'booth'}, {'id': 89, 'name': 'television receiver'}, {'id': 90, 'name': 'airplane'}, {'id': 92, 'name': 'apparel'}, {'id': 93, 'name': 'pole'}, {'id': 95, 'name': 'bannister'}, {'id': 97, 'name': 'ottoman'}, {'id': 98, 'name': 'bottle'}, {'id': 102, 'name': 'van'}, {'id': 103, 'name': 'ship'}, {'id': 104, 'name': 'fountain'}, {'id': 107, 'name': 'washer'}, {'id': 108, 'name': 'plaything'}, {'id': 110, 'name': 'stool'}, {'id': 111, 'name': 'barrel'}, {'id': 112, 'name': 'basket'}, {'id': 115, 'name': 'bag'}, {'id': 116, 'name': 'minibike'}, {'id': 118, 'name': 'oven'}, {'id': 119, 'name': 'ball'}, {'id': 120, 'name': 'food'}, {'id': 121, 'name': 'step'}, {'id': 123, 'name': 'trade name'}, {'id': 124, 'name': 'microwave'}, {'id': 125, 'name': 'pot'}, {'id': 126, 'name': 'animal'}, {'id': 127, 'name': 'bicycle'}, {'id': 129, 'name': 'dishwasher'}, {'id': 130, 'name': 
'screen'}, {'id': 132, 'name': 'sculpture'}, {'id': 133, 'name': 'hood'}, {'id': 134, 'name': 'sconce'}, {'id': 135, 'name': 'vase'}, {'id': 136, 'name': 'traffic light'}, {'id': 137, 'name': 'tray'}, {'id': 138, 'name': 'ashcan'}, {'id': 139, 'name': 'fan'}, {'id': 142, 'name': 'plate'}, {'id': 143, 'name': 'monitor'}, {'id': 144, 'name': 'bulletin board'}, {'id': 146, 'name': 'radiator'}, {'id': 147, 'name': 'glass'}, {'id': 148, 'name': 'clock'}, {'id': 149, 'name': 'flag'}] 13 | 14 | 15 | _PREDEFINED_SPLITS = { 16 | # point annotations without masks 17 | "ade20k_instance_train": ( 18 | "ADEChallengeData2016/images/training", 19 | "ADEChallengeData2016/ade20k_instance_train.json", 20 | ), 21 | "ade20k_instance_val": ( 22 | "ADEChallengeData2016/images/validation", 23 | "ADEChallengeData2016/ade20k_instance_val.json", 24 | ), 25 | } 26 | 27 | 28 | def _get_ade_instances_meta(): 29 | thing_ids = [k["id"] for k in ADE_CATEGORIES] 30 | assert len(thing_ids) == 100, len(thing_ids) 31 | # Mapping from the incontiguous ADE category id to an id in [0, 99] 32 | thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} 33 | thing_classes = [k["name"] for k in ADE_CATEGORIES] 34 | ret = { 35 | "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, 36 | "thing_classes": thing_classes, 37 | } 38 | return ret 39 | 40 | 41 | def register_all_ade20k_instance(root): 42 | for key, (image_root, json_file) in _PREDEFINED_SPLITS.items(): 43 | # Assume pre-defined datasets live in `./datasets`. 44 | register_coco_instances( 45 | key, 46 | _get_ade_instances_meta(), 47 | os.path.join(root, json_file) if "://" not in json_file else json_file, 48 | os.path.join(root, image_root), 49 | ) 50 | 51 | 52 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 53 | register_all_ade20k_instance(_root) 54 | -------------------------------------------------------------------------------- /evaluate/mask.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tsungyi' 2 | 3 | import pycocotools._mask as _mask 4 | 5 | # Interface for manipulating masks stored in RLE format. 6 | # 7 | # RLE is a simple yet efficient format for storing binary masks. RLE 8 | # first divides a vector (or vectorized image) into a series of piecewise 9 | # constant regions and then for each piece simply stores the length of 10 | # that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would 11 | # be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1] 12 | # (note that the odd counts are always the numbers of zeros). Instead of 13 | # storing the counts directly, additional compression is achieved with a 14 | # variable bitrate representation based on a common scheme called LEB128. 15 | # 16 | # Compression is greatest given large piecewise constant regions. 17 | # Specifically, the size of the RLE is proportional to the number of 18 | # *boundaries* in M (or for an image the number of boundaries in the y 19 | # direction). Assuming fairly simple shapes, the RLE representation is 20 | # O(sqrt(n)) where n is number of pixels in the object. Hence space usage 21 | # is substantially lower, especially for large simple objects (large n). 22 | # 23 | # Many common operations on masks can be computed directly using the RLE 24 | # (without need for decoding). This includes computations such as area, 25 | # union, intersection, etc. 
All of these operations are linear in the 26 | # size of the RLE, in other words they are O(sqrt(n)) where n is the area 27 | # of the object. Computing these operations on the original mask is O(n). 28 | # Thus, using the RLE can result in substantial computational savings. 29 | # 30 | # The following API functions are defined: 31 | # encode - Encode binary masks using RLE. 32 | # decode - Decode binary masks encoded via RLE. 33 | # merge - Compute union or intersection of encoded masks. 34 | # iou - Compute intersection over union between masks. 35 | # area - Compute area of encoded masks. 36 | # toBbox - Get bounding boxes surrounding encoded masks. 37 | # frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask. 38 | # 39 | # Usage: 40 | # Rs = encode( masks ) 41 | # masks = decode( Rs ) 42 | # R = merge( Rs, intersect=false ) 43 | # o = iou( dt, gt, iscrowd ) 44 | # a = area( Rs ) 45 | # bbs = toBbox( Rs ) 46 | # Rs = frPyObjects( [pyObjects], h, w ) 47 | # 48 | # In the API the following formats are used: 49 | # Rs - [dict] Run-length encoding of binary masks 50 | # R - dict Run-length encoding of binary mask 51 | # masks - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order) 52 | # iscrowd - [nx1] list of np.ndarray. 1 indicates corresponding gt image has crowd region to ignore 53 | # bbs - [nx4] Bounding box(es) stored as [x y w h] 54 | # poly - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list) 55 | # dt,gt - May be either bounding boxes or encoded masks 56 | # Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel). 57 | # 58 | # Finally, a note about the intersection over union (iou) computation. 59 | # The standard iou of a ground truth (gt) and detected (dt) object is 60 | # iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt)) 61 | # For "crowd" regions, we use a modified criteria. If a gt object is 62 | # marked as "iscrowd", we allow a dt to match any subregion of the gt. 63 | # Choosing gt' in the crowd gt that best matches the dt can be done using 64 | # gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing 65 | # iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt) 66 | # For crowd gt regions we use this modified criteria above for the iou. 67 | # 68 | # To compile run "python setup.py build_ext --inplace" 69 | # Please do not contact us for help with compiling. 70 | # 71 | # Microsoft COCO Toolbox. version 2.0 72 | # Data, paper, and tutorials available at: http://mscoco.org/ 73 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 
74 | # Licensed under the Simplified BSD License [see coco/license.txt] 75 | 76 | iou = _mask.iou 77 | merge = _mask.merge 78 | frPyObjects = _mask.frPyObjects 79 | 80 | def encode(bimask): 81 | if len(bimask.shape) == 3: 82 | return _mask.encode(bimask) 83 | elif len(bimask.shape) == 2: 84 | h, w = bimask.shape 85 | return _mask.encode(bimask.reshape((h, w, 1), order='F'))[0] 86 | 87 | def decode(rleObjs): 88 | if type(rleObjs) == list: 89 | return _mask.decode(rleObjs) 90 | else: 91 | return _mask.decode([rleObjs])[:,:,0] 92 | 93 | def area(rleObjs): 94 | if type(rleObjs) == list: 95 | return _mask.area(rleObjs) 96 | else: 97 | return _mask.area([rleObjs])[0] 98 | 99 | def toBbox(rleObjs): 100 | if type(rleObjs) == list: 101 | return _mask.toBbox(rleObjs) 102 | else: 103 | return _mask.toBbox([rleObjs])[0] -------------------------------------------------------------------------------- /ovformer/data_video/datasets/builtin.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | import os 5 | 6 | from .ytvis import ( 7 | register_ytvis_instances, 8 | _get_ytvis_2019_instances_meta, 9 | _get_ytvis_2021_instances_meta, 10 | ) 11 | 12 | from .ovis import ( 13 | register_ovis_instances, 14 | _get_ovis_instances_meta, 15 | ) 16 | 17 | from .lvvis import ( 18 | register_lvvis_instances, 19 | _get_lvvis_instances_meta, 20 | ) 21 | 22 | from .burst import ( 23 | register_burst_instances, 24 | _get_burst_instances_meta, 25 | ) 26 | 27 | # ==== Predefined splits for YTVIS 2019 =========== 28 | _PREDEFINED_SPLITS_YTVIS_2019 = { 29 | "ytvis_2019_train": ("ytvis_2019/train/JPEGImages", 30 | "ytvis_2019/train.json"), 31 | "ytvis_2019_val": ("ytvis_2019/valid/JPEGImages", 32 | "ytvis_2019/valid.json"), 33 | "ytvis_2019_test": ("ytvis_2019/test/JPEGImages", 34 | "ytvis_2019/test.json"), 35 | } 36 | 37 | 38 | # ==== Predefined splits for YTVIS 2021 =========== 39 | _PREDEFINED_SPLITS_YTVIS_2021 = { 40 | "ytvis_2021_train": ("ytvis_2021/train/JPEGImages", 41 | "ytvis_2021/train.json"), 42 | "ytvis_2021_val": ("ytvis_2021/valid/JPEGImages", 43 | "ytvis_2021/valid.json"), 44 | "ytvis_2021_test": ("ytvis_2021/test/JPEGImages", 45 | "ytvis_2021/test.json"), 46 | } 47 | 48 | # ==== Predefined splits for OVIS =========== 49 | _PREDEFINED_SPLITS_OVIS = { 50 | "ovis_train": ("ovis/train", 51 | "ovis/annotations/train.json"), 52 | "ovis_val": ("ovis/valid", 53 | "ovis/annotations/valid.json"), 54 | "ovis_test": ("ovis/test", 55 | "ovis/annotations/test.json"), 56 | } 57 | 58 | # ==== Predefined splits for LVVIS =========== 59 | _PREDEFINED_SPLITS_LVVIS = { 60 | "lvvis_train": ("LVVIS/train/JPEGImages", 61 | "LVVIS/train/train_instances_nonovel.json"), 62 | "lvvis_val": ("LVVIS/val/JPEGImages", 63 | "LVVIS/val/val_instances_.json"), 64 | "lvvis_test": ("LVVIS/test/JPEGImages", 65 | "LVVIS/test/test_instances.json"), 66 | } 67 | 68 | _PREDEFINED_SPLITS_BURST= { 69 | "burst_val": ("burst/val", 70 | "burst/b2y_val.json"), 71 | } 72 | 73 | 74 | def register_all_ytvis_2019(root): 75 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2019.items(): 76 | # Assume pre-defined datasets live in `./datasets`. 
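# Each call below binds a dataset name (e.g. "ytvis_2019_train") to its annotation JSON and
# image root, so the DATASETS entries in the training/eval configs can refer to it by name.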
77 | register_ytvis_instances( 78 | key, 79 | _get_ytvis_2019_instances_meta(), 80 | os.path.join(root, json_file) if "://" not in json_file else json_file, 81 | os.path.join(root, image_root), 82 | ) 83 | 84 | 85 | def register_all_ytvis_2021(root): 86 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2021.items(): 87 | # Assume pre-defined datasets live in `./datasets`. 88 | register_ytvis_instances( 89 | key, 90 | _get_ytvis_2021_instances_meta(), 91 | os.path.join(root, json_file) if "://" not in json_file else json_file, 92 | os.path.join(root, image_root), 93 | ) 94 | 95 | def register_all_ovis(root): 96 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_OVIS.items(): 97 | # Assume pre-defined datasets live in `./datasets`. 98 | register_ovis_instances( 99 | key, 100 | _get_ovis_instances_meta(), 101 | os.path.join(root, json_file) if "://" not in json_file else json_file, 102 | os.path.join(root, image_root), 103 | ) 104 | 105 | def register_all_lvvis(root): 106 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_LVVIS.items(): 107 | # Assume pre-defined datasets live in `./datasets`. 108 | register_lvvis_instances( 109 | key, 110 | _get_lvvis_instances_meta(), 111 | os.path.join(root, json_file) if "://" not in json_file else json_file, 112 | os.path.join(root, image_root), 113 | ) 114 | 115 | def register_all_burst(root): 116 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_BURST.items(): 117 | # Assume pre-defined datasets live in `./datasets`. 118 | register_burst_instances( 119 | key, 120 | _get_burst_instances_meta(), 121 | os.path.join(root, json_file) if "://" not in json_file else json_file, 122 | os.path.join(root, image_root), 123 | ) 124 | 125 | if __name__.endswith(".builtin"): 126 | # Assume pre-defined datasets live in `./datasets`. 127 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 128 | register_all_ytvis_2019(_root) 129 | register_all_ytvis_2021(_root) 130 | register_all_ovis(_root) 131 | register_all_lvvis(_root) 132 | register_all_burst(_root) 133 | -------------------------------------------------------------------------------- /ovformer/evaluation/instance_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import contextlib 3 | import copy 4 | import io 5 | import itertools 6 | import json 7 | import logging 8 | import numpy as np 9 | import os 10 | import pickle 11 | from collections import OrderedDict 12 | import pycocotools.mask as mask_util 13 | import torch 14 | from pycocotools.coco import COCO 15 | from pycocotools.cocoeval import COCOeval 16 | from tabulate import tabulate 17 | 18 | import detectron2.utils.comm as comm 19 | from detectron2.config import CfgNode 20 | from detectron2.data import MetadataCatalog 21 | from detectron2.data.datasets.coco import convert_to_coco_json 22 | from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco 23 | from detectron2.evaluation.fast_eval_api import COCOeval_opt 24 | from detectron2.structures import Boxes, BoxMode, pairwise_iou 25 | from detectron2.utils.file_io import PathManager 26 | from detectron2.utils.logger import create_small_table 27 | 28 | 29 | # modified from COCOEvaluator for instance segmetnat 30 | class InstanceSegEvaluator(COCOEvaluator): 31 | """ 32 | Evaluate AR for object proposals, AP for instance detection/segmentation, AP 33 | for keypoint detection outputs using COCO's metrics. 
34 | See http://cocodataset.org/#detection-eval and 35 | http://cocodataset.org/#keypoints-eval to understand its metrics. 36 | The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means 37 | the metric cannot be computed (e.g. due to no predictions made). 38 | 39 | In addition to COCO, this evaluator is able to support any bounding box detection, 40 | instance segmentation, or keypoint detection dataset. 41 | """ 42 | 43 | def _eval_predictions(self, predictions, img_ids=None): 44 | """ 45 | Evaluate predictions. Fill self._results with the metrics of the tasks. 46 | """ 47 | self._logger.info("Preparing results for COCO format ...") 48 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) 49 | tasks = self._tasks or self._tasks_from_predictions(coco_results) 50 | 51 | # unmap the category ids for COCO 52 | if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): 53 | dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id 54 | # all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) 55 | # num_classes = len(all_contiguous_ids) 56 | # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 57 | 58 | reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} 59 | for result in coco_results: 60 | category_id = result["category_id"] 61 | # assert category_id < num_classes, ( 62 | # f"A prediction has class={category_id}, " 63 | # f"but the dataset only has {num_classes} classes and " 64 | # f"predicted class id should be in [0, {num_classes - 1}]." 65 | # ) 66 | assert category_id in reverse_id_mapping, ( 67 | f"A prediction has class={category_id}, " 68 | f"but the dataset only has class ids in {dataset_id_to_contiguous_id}." 69 | ) 70 | result["category_id"] = reverse_id_mapping[category_id] 71 | 72 | if self._output_dir: 73 | file_path = os.path.join(self._output_dir, "coco_instances_results.json") 74 | self._logger.info("Saving results to {}".format(file_path)) 75 | with PathManager.open(file_path, "w") as f: 76 | f.write(json.dumps(coco_results)) 77 | f.flush() 78 | 79 | if not self._do_evaluation: 80 | self._logger.info("Annotations are not available for evaluation.") 81 | return 82 | 83 | self._logger.info( 84 | "Evaluating predictions with {} COCO API...".format( 85 | "unofficial" if self._use_fast_impl else "official" 86 | ) 87 | ) 88 | for task in sorted(tasks): 89 | assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!" 90 | coco_eval = ( 91 | _evaluate_predictions_on_coco( 92 | self._coco_api, 93 | coco_results, 94 | task, 95 | kpt_oks_sigmas=self._kpt_oks_sigmas, 96 | use_fast_impl=self._use_fast_impl, 97 | img_ids=img_ids, 98 | max_dets_per_image=self._max_dets_per_image, 99 | ) 100 | if len(coco_results) > 0 101 | else None # cocoapi does not handle empty results very well 102 | ) 103 | 104 | res = self._derive_coco_results( 105 | coco_eval, task, class_names=self._metadata.get("thing_classes") 106 | ) 107 | self._results[task] = res 108 | -------------------------------------------------------------------------------- /ovformer/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | from detectron2.config import CfgNode as CN 4 | 5 | 6 | def add_ovformer_config(cfg): 7 | """ 8 | Add config for ovformer. 
9 | """ 10 | # data config 11 | # select the dataset mapper 12 | cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic" 13 | # Color augmentation 14 | cfg.INPUT.COLOR_AUG_SSD = False 15 | # We retry random cropping until no single category in semantic segmentation GT occupies more 16 | # than `SINGLE_CATEGORY_MAX_AREA` part of the crop. 17 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0 18 | # Pad image and segmentation GT in dataset mapper. 19 | cfg.INPUT.SIZE_DIVISIBILITY = -1 20 | 21 | cfg.INPUT.SAMPLING_FRAME_NUM = 2 22 | cfg.INPUT.SAMPLING_FRAME_RANGE = 20 23 | cfg.INPUT.SAMPLING_FRAME_SHUFFLE = False 24 | cfg.INPUT.AUGMENTATIONS = [] # "brightness", "contrast", "saturation", "rotation" 25 | 26 | 27 | # solver config 28 | # weight decay on embedding 29 | cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0 30 | # optimizer 31 | cfg.SOLVER.OPTIMIZER = "ADAMW" 32 | cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1 33 | 34 | # mask_former model config 35 | cfg.MODEL.MASK_FORMER = CN() 36 | 37 | 38 | # loss 39 | cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION = True 40 | cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT = 0.1 41 | cfg.MODEL.MASK_FORMER.OBJECT_WEIGHT = 1.0 42 | cfg.MODEL.MASK_FORMER.CLASS_WEIGHT = 1.0 43 | cfg.MODEL.MASK_FORMER.DICE_WEIGHT = 1.0 44 | cfg.MODEL.MASK_FORMER.MASK_WEIGHT = 20.0 45 | 46 | # transformer config 47 | cfg.MODEL.MASK_FORMER.NHEADS = 8 48 | cfg.MODEL.MASK_FORMER.DROPOUT = 0.1 49 | cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD = 2048 50 | cfg.MODEL.MASK_FORMER.ENC_LAYERS = 0 51 | cfg.MODEL.MASK_FORMER.DEC_LAYERS = 6 52 | cfg.MODEL.MASK_FORMER.PRE_NORM = False 53 | 54 | cfg.MODEL.MASK_FORMER.HIDDEN_DIM = 256 55 | cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES = 100 56 | 57 | cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE = "res5" 58 | cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ = False 59 | 60 | # mask_former inference config 61 | cfg.MODEL.MASK_FORMER.TEST = CN() 62 | cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True 63 | cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = False 64 | cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = False 65 | cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0 66 | cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD = 0.0 67 | cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False 68 | 69 | # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. 
ResNet) 70 | # you can use this config to override 71 | cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY = 32 72 | 73 | 74 | cfg.MODEL.MASK_FORMER.CLIP_TEXT_PATH = '' 75 | cfg.MODEL.MASK_FORMER.CLIP_IMAGE_PATH = '' 76 | 77 | # classifier config 78 | cfg.MODEL.MASK_FORMER.CLIP_CLASSIFIER = False 79 | cfg.MODEL.MASK_FORMER.AGNOSTIC_CLASSIFIER = False 80 | 81 | # pixel decoder config 82 | cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256 83 | # adding transformer in pixel decoder 84 | cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0 85 | # pixel decoder 86 | cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder" 87 | 88 | # swin transformer backbone 89 | cfg.MODEL.SWIN = CN() 90 | cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224 91 | cfg.MODEL.SWIN.PATCH_SIZE = 4 92 | cfg.MODEL.SWIN.EMBED_DIM = 96 93 | cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2] 94 | cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24] 95 | cfg.MODEL.SWIN.WINDOW_SIZE = 7 96 | cfg.MODEL.SWIN.MLP_RATIO = 4.0 97 | cfg.MODEL.SWIN.QKV_BIAS = True 98 | cfg.MODEL.SWIN.QK_SCALE = None 99 | cfg.MODEL.SWIN.DROP_RATE = 0.0 100 | cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0 101 | cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3 102 | cfg.MODEL.SWIN.APE = False 103 | cfg.MODEL.SWIN.PATCH_NORM = True 104 | cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"] 105 | cfg.MODEL.SWIN.USE_CHECKPOINT = False 106 | 107 | # NOTE: maskformer2 extra configs 108 | # transformer module 109 | cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME = "MultiScaleMaskedTransformerDecoder" 110 | 111 | cfg.MODEL.TIMM = CN() 112 | cfg.MODEL.TIMM.BASE_NAME = 'resnet50' 113 | cfg.MODEL.TIMM.OUT_LEVELS = (2, 3, 4, 5) 114 | cfg.MODEL.TIMM.NORM = 'FrozenBN' 115 | cfg.MODEL.TIMM.FREEZE_AT = 0 116 | cfg.MODEL.TIMM.PRETRAINED = False 117 | 118 | # LSJ aug 119 | cfg.INPUT.IMAGE_SIZE = 1024 120 | cfg.INPUT.MIN_SCALE = 0.1 121 | cfg.INPUT.MAX_SCALE = 2.0 122 | 123 | # MSDeformAttn encoder configs 124 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"] 125 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4 126 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8 127 | 128 | # point loss configs 129 | # Number of points sampled during training for a mask point head. 130 | cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS = 112 * 112 131 | # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the 132 | # original paper. 133 | cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO = 3.0 134 | # Importance sampling parameter for PointRend point sampling during training. Parametr `beta` in 135 | # the original paper. 
136 | cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75 137 | -------------------------------------------------------------------------------- /tools/save_clip_features.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from clip import clip 3 | from PIL import Image 4 | from tqdm import tqdm 5 | import json 6 | import os 7 | 8 | device = "cuda" if torch.cuda.is_available() else "cpu" 9 | model, preprocess = clip.load("ViT-B/32", device=device) 10 | for _, param in model.named_parameters(): 11 | param.requires_grad = False 12 | 13 | # LVIS train 14 | json_path = 'datasets/lvis/lvis_v1_train.json' 15 | file_dir = "datasets/coco/train2017/" 16 | save_path = "datasets/metadata/lvis_train_clip_feature.pkl" 17 | data = json.load(open(json_path, 'r')) 18 | dic = {} 19 | for image in tqdm(data['images']): 20 | file_name = file_dir + f"{image['id']}".zfill(12) + ".jpg" 21 | image_clip = preprocess(Image.open(file_name)).unsqueeze(0).to(device) 22 | feature_clip = model.encode_image(image_clip) 23 | dic[image['id']] = feature_clip 24 | torch.save(dic, save_path) 25 | 26 | 27 | # LVIS val 28 | json_path = 'datasets/lvis/lvis_v1_val.json' 29 | file_dir = "datasets/coco/val2017/" 30 | save_path = "datasets/metadata/lvis_val_clip_feature.pkl" 31 | data = json.load(open(json_path, 'r')) 32 | dic = {} 33 | for image in tqdm(data['images']): 34 | file_name = file_dir + f"{image['id']}".zfill(12) + ".jpg" 35 | if not os.path.exists(file_name): 36 | file_name = "datasets/coco/train2017/" + f"{image['id']}".zfill(12) + ".jpg" 37 | image_clip = preprocess(Image.open(file_name)).unsqueeze(0).to(device) 38 | feature_clip = model.encode_image(image_clip) 39 | dic[image['id']] = feature_clip 40 | torch.save(dic, save_path) 41 | 42 | # LVVIS train 43 | json_path = 'datasets/LVVIS/train/train_instances_.json' # 44 | file_dir = "datasets/LVVIS/train/JPEGImages/" 45 | save_path = "datasets/metadata/lvvis_train_clip_feature.pkl" 46 | data = json.load(open(json_path, 'r')) 47 | dic = {} 48 | for video in tqdm(data['videos']): 49 | for image in video["file_names"]: 50 | file_name = file_dir + image 51 | image_clip = preprocess(Image.open(file_name)).unsqueeze(0).to(device) 52 | feature_clip = model.encode_image(image_clip) 53 | dic[file_name] = feature_clip 54 | torch.save(dic, save_path) 55 | 56 | # LVVIS val 57 | json_path = 'datasets/LVVIS/val/val_instances_.json' 58 | file_dir = "datasets/LVVIS/val/JPEGImages/" 59 | save_path = "datasets/metadata/lvvis_val_clip_feature.pkl" 60 | data = json.load(open(json_path, 'r')) 61 | dic = {} 62 | for video in tqdm(data['videos']): 63 | for image in video["file_names"]: 64 | file_name = file_dir + image 65 | image_clip = preprocess(Image.open(file_name)).unsqueeze(0).to(device) 66 | feature_clip = model.encode_image(image_clip) 67 | dic[file_name] = feature_clip 68 | torch.save(dic, save_path) 69 | 70 | # LVVIS test 71 | json_path = 'datasets/LVVIS/test/test_instances.json' 72 | file_dir = "datasets/LVVIS/test/JPEGImages/" 73 | save_path = "datasets/metadata/lvvis_test_clip_feature.pkl" 74 | data = json.load(open(json_path, 'r')) 75 | dic = {} 76 | for video in tqdm(data['videos']): 77 | for image in video["file_names"]: 78 | file_name = file_dir + image 79 | image_clip = preprocess(Image.open(file_name)).unsqueeze(0).to(device) 80 | feature_clip = model.encode_image(image_clip) 81 | dic[file_name] = feature_clip 82 | torch.save(dic, save_path) 83 | 84 | # ytvis_2019 val 85 | json_path = 'datasets/ytvis_2019/valid.json' 
86 | file_dir = "datasets/ytvis_2019/valid/JPEGImages/" 87 | save_path = "datasets/metadata/ytvis_2019_val_clip_feature.pkl" 88 | data = json.load(open(json_path, 'r')) 89 | dic = {} 90 | for video in tqdm(data['videos']): 91 | for image in video["file_names"]: 92 | file_name = file_dir + image 93 | image_clip = preprocess(Image.open(file_name)).unsqueeze(0).to(device) 94 | feature_clip = model.encode_image(image_clip) 95 | dic[file_name] = feature_clip 96 | torch.save(dic, save_path) 97 | 98 | # ytvis_2021 val 99 | json_path = 'datasets/ytvis_2021/valid.json' 100 | file_dir = "datasets/ytvis_2021/valid/JPEGImages/" 101 | save_path = "datasets/metadata/ytvis_2021_val_clip_feature.pkl" 102 | data = json.load(open(json_path, 'r')) 103 | dic = {} 104 | for video in tqdm(data['videos']): 105 | for image in video["file_names"]: 106 | file_name = file_dir + image 107 | image_clip = preprocess(Image.open(file_name)).unsqueeze(0).to(device) 108 | feature_clip = model.encode_image(image_clip) 109 | dic[file_name] = feature_clip 110 | torch.save(dic, save_path) 111 | 112 | # ovis val 113 | json_path = 'datasets/ovis/annotations/valid.json' 114 | file_dir = "datasets/ovis/valid/" 115 | save_path = "datasets/metadata/ovis_val_clip_feature.pkl" 116 | data = json.load(open(json_path, 'r')) 117 | dic = {} 118 | for video in tqdm(data['videos']): 119 | for image in video["file_names"]: 120 | file_name = file_dir + image 121 | image_clip = preprocess(Image.open(file_name)).unsqueeze(0).to(device) 122 | feature_clip = model.encode_image(image_clip) 123 | dic[file_name] = feature_clip 124 | torch.save(dic, save_path) 125 | 126 | # burst val 127 | json_path = 'datasets/burst/b2y_val.json' 128 | file_dir = "datasets/burst/val/" 129 | save_path = "datasets/metadata/burst_val_clip_feature.pkl" 130 | data = json.load(open(json_path, 'r')) 131 | dic = {} 132 | for video in tqdm(data['videos']): 133 | for image in video["file_names"]: 134 | file_name = file_dir + image 135 | image_clip = preprocess(Image.open(file_name)).unsqueeze(0).to(device) 136 | feature_clip = model.encode_image(image_clip) 137 | dic[file_name] = feature_clip 138 | torch.save(dic, save_path) 139 | -------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | # Prepare Datasets for OVFormer 2 | 3 | OVFormer has builtin support for a few datasets. 4 | The datasets are assumed to exist in a directory specified by the environment variable 5 | `DETECTRON2_DATASETS`. 6 | Under this directory, detectron2 will look for datasets in the structure described below, if needed. 7 | ``` 8 | $DETECTRON2_DATASETS/ 9 | coco/ 10 | lvis/ 11 | LVVIS/ 12 | ytvis_2019/ 13 | ytvis_2021/ 14 | ovis/ 15 | burst/ 16 | metadata/ 17 | ``` 18 | 19 | You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`. 20 | If left unset, the default is `./datasets` relative to your current working directory. 
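For example, to keep the datasets outside the repository and create the top-level layout shown above (the path below is illustrative):

```bash
export DETECTRON2_DATASETS=/data/ovformer_datasets
mkdir -p $DETECTRON2_DATASETS/{coco,lvis,LVVIS,ytvis_2019,ytvis_2021,ovis,burst,metadata}
```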
21 | 
22 | 
24 | 
25 | ## STEP-1: Prepare Image & Video Instance Segmentation datasets
26 | ### Expected dataset structure for [COCO](https://cocodataset.org/#download):
27 | 
28 | ```
29 | coco/
30 |   annotations/
31 |     instances_{train,val}2017.json
32 |   {train,val}2017/
33 | ```
34 | 
35 | ### Expected dataset structure for [LVIS](https://www.lvisdataset.org/dataset):
36 | 
37 | ```
38 | lvis/
39 |   lvis_v1_train.json
40 |   lvis_v1_train_norare.json
41 |   lvis_v1_val.json
42 | ```
43 | Next, prepare the open-vocabulary LVIS training set using
44 | ```bash
45 | python tools/remove_lvis_rare.py --ann datasets/lvis/lvis_v1_train.json
46 | ```
47 | This will generate `datasets/lvis/lvis_v1_train_norare.json`.
48 | 
49 | ### Expected dataset structure for [LV-VIS](https://github.com/haochenheheda/LVVIS):
50 | 
51 | ```
52 | LVVIS/
53 |   train/
54 |     JPEGImages/
55 |     train_instances_.json
56 |     train_instances_nonovel.json
57 |   val/
58 |     JPEGImages/
59 |     val_instances_.json
60 |   test/
61 |     JPEGImages/
62 |     test_instances.json
63 | ```
64 | LV-VIS does not officially provide a JSON file for the test set; generate it using
65 | ```bash
66 | python tools/lvivs_test_instances_json.py
67 | ```
68 | This will generate `datasets/LVVIS/test/test_instances.json`.
69 | 
70 | Next, prepare the open-vocabulary LV-VIS training set using
71 | ```bash
72 | python tools/remove_lvvis_novel.py --ann datasets/LVVIS/train/train_instances_.json
73 | ```
74 | This will generate `datasets/LVVIS/train/train_instances_nonovel.json`.
75 | 
76 | ### Expected dataset structure for [YouTubeVIS 2019](https://codalab.lisn.upsaclay.fr/competitions/7682):
77 | 
78 | ```
79 | ytvis_2019/
80 |   {train,valid,test}.json
81 |   {train,valid,test}/
82 |     JPEGImages/
83 | ```
84 | 
85 | ### Expected dataset structure for [YouTubeVIS 2021](https://codalab.lisn.upsaclay.fr/competitions/7680):
86 | 
87 | ```
88 | ytvis_2021/
89 |   {train,valid,test}.json
90 |   {train,valid,test}/
91 |     JPEGImages/
92 | ```
93 | 
94 | ### Expected dataset structure for [OVIS](https://codalab.lisn.upsaclay.fr/competitions/4763):
95 | 
96 | ```
97 | ovis/
98 |   annotations/
99 |     {train,valid,test}.json
100 |   {train,valid,test}/
101 |     JPEGImages/
102 | ```
103 | 
104 | ### Expected dataset structure for [BURST](https://github.com/Ali2500/BURST-benchmark):
105 | ```
106 | burst/
107 |   info/
108 |     class_split.json
109 |   val/
110 |     ArgoVerse/
111 |     AVA/
112 |     BDD/
113 |     Charades/
114 |     HACS/
115 |     LaSOT/
116 |     YFCC100M/
117 |     all_classes.json
118 |   b2y_val.json
119 | 
120 | ```
121 | Download the BURST val set data (except the AVA and HACS videos):
122 | 
123 | ```bash
124 | wget https://motchallenge.net/data/2-TAO_VAL.zip
125 | wget https://omnomnom.vision.rwth-aachen.de/data/BURST/annotations.zip
126 | ```
127 | To download the TAO AVA and HACS videos, you need to sign in to a [MOTChallenge](https://motchallenge.net/login/) account.
128 | 
129 | The `b2y_val.json` file is the YouTube-VIS-format annotation file generated by
130 | ```bash
131 | python tools/burst2ytvis.py --ann datasets/burst/val/all_classes.json --out datasets/burst/b2y_val.json
132 | ```
133 | 
134 | 
135 | ## STEP-2: Prepare metadata
136 | #### Download [metadata](https://drive.google.com/file/d/10M7PQdCc9n6dM0NHbOKXOO_cdBHPavRZ/view?usp=sharing), and organize the files according to the following structure:
137 | ```
138 | metadata/
139 |   fg_bg_5_10_coco_ens.npy
140 |   fg_bg_5_10_lvis_ens.npy
141 |   fg_bg_5_10_lvvis_ens.npy
142 |   fg_bg_5_10_ovis_ens.npy
143 |   fg_bg_5_10_ytvis19_ens.npy
144 |   fg_bg_5_10_ytvis21_ens.npy
145 |   fg_bg_5_10_burst_ens.npy
146 | ```
147 | This metadata contains the pre-computed classifiers for each dataset, which are generated by [DetPro](https://github.com/dyabel/detpro).
148 | If you want to generate custom classifiers, please follow that project.
149 | 
150 | ```
151 | metadata/
152 |   lvis_v1_train_cat_info.json
153 |   lvvis_train_cat_info.json
154 | ```
155 | This metadata contains the category information for the two training sets,
156 | which are generated by [get_lvis_cat_info.py](../tools/get_lvis_cat_info.py) and [get_lvvis_cat_info.py](../tools/get_lvvis_cat_info.py).
157 | 
158 | ```
159 | metadata/
160 |   lvis_train_clip_feature.pkl
161 |   lvis_val_clip_feature.pkl
162 |   lvvis_train_clip_feature.pkl
163 |   lvvis_val_clip_feature.pkl
164 |   lvvis_test_clip_feature.pkl
165 |   ytvis_2019_val_clip_feature.pkl
166 |   ytvis_2021_val_clip_feature.pkl
167 |   ovis_val_clip_feature.pkl
168 |   burst_val_clip_feature.pkl
169 | ```
170 | This metadata contains the CLIP image features for each dataset,
171 | which are generated by [save_clip_features.py](../tools/save_clip_features.py).
172 | 
173 | ## STEP-3: Prepare Pretrained Model
174 | Like [OV2Seg](https://github.com/haochenheheda/LVVIS), our paper uses ImageNet-21K pretrained models that are not part of Detectron2 (ResNet-50-21K from [MIIL](https://github.com/Alibaba-MIIL/ImageNet21K) and SwinB-21K from [Swin-Transformer](https://github.com/microsoft/Swin-Transformer)). Before training,
175 | please download the models, place them under `models/`, and follow [this tool](../tools/convert-thirdparty-pretrained-model-to-d2.py) to convert the format.
176 | 
177 | ```
178 | models/
179 |   resnet50_miil_21k.pkl
180 |   swin_base_patch4_window12_384_22k.pkl
181 | datasets/
182 |   metadata/
183 |     ...
184 | ```
--------------------------------------------------------------------------------
/ovformer/modeling/meta_arch/mask_former_head.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
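# NOTE: MaskFormerHead wires a pixel decoder (mask features + multi-scale
# features) to a transformer predictor. In OVFormer, forward() additionally
# receives per-image CLIP features (`features_clip`), which are forwarded to
# the predictor for open-vocabulary mask classification.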
2 | import logging 3 | from copy import deepcopy 4 | from typing import Callable, Dict, List, Optional, Tuple, Union 5 | 6 | import fvcore.nn.weight_init as weight_init 7 | from torch import nn 8 | from torch.nn import functional as F 9 | 10 | from detectron2.config import configurable 11 | from detectron2.layers import Conv2d, ShapeSpec, get_norm 12 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY 13 | 14 | from ..transformer_decoder.maskformer_transformer_decoder import build_transformer_decoder 15 | from ..pixel_decoder.fpn import build_pixel_decoder 16 | 17 | 18 | @SEM_SEG_HEADS_REGISTRY.register() 19 | class MaskFormerHead(nn.Module): 20 | 21 | _version = 2 22 | 23 | def _load_from_state_dict( 24 | self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs 25 | ): 26 | version = local_metadata.get("version", None) 27 | if version is None or version < 2: 28 | # Do not warn if train from scratch 29 | scratch = True 30 | logger = logging.getLogger(__name__) 31 | for k in list(state_dict.keys()): 32 | newk = k 33 | #if "sem_seg_head" in k and not k.startswith(prefix + "predictor"): 34 | # newk = k.replace(prefix, prefix + "pixel_decoder.") 35 | # # logger.debug(f"{k} ==> {newk}") 36 | if newk != k: 37 | state_dict[newk] = state_dict[k] 38 | del state_dict[k] 39 | scratch = False 40 | 41 | if not scratch: 42 | logger.warning( 43 | f"Weight format of {self.__class__.__name__} have changed! " 44 | "Please upgrade your models. Applying automatic conversion now ..." 45 | ) 46 | 47 | @configurable 48 | def __init__( 49 | self, 50 | input_shape: Dict[str, ShapeSpec], 51 | *, 52 | num_classes: int, 53 | pixel_decoder: nn.Module, 54 | loss_weight: float = 1.0, 55 | ignore_value: int = -1, 56 | # extra parameters 57 | transformer_predictor: nn.Module, 58 | transformer_in_feature: str, 59 | ): 60 | """ 61 | NOTE: this interface is experimental. 62 | Args: 63 | input_shape: shapes (channels and stride) of the input features 64 | num_classes: number of classes to predict 65 | pixel_decoder: the pixel decoder module 66 | loss_weight: loss weight 67 | ignore_value: category id to be ignored during training. 
68 | transformer_predictor: the transformer decoder that makes prediction 69 | transformer_in_feature: input feature name to the transformer_predictor 70 | """ 71 | super().__init__() 72 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) 73 | self.in_features = [k for k, v in input_shape] 74 | feature_strides = [v.stride for k, v in input_shape] 75 | feature_channels = [v.channels for k, v in input_shape] 76 | 77 | self.ignore_value = ignore_value 78 | self.common_stride = 4 79 | self.loss_weight = loss_weight 80 | 81 | self.pixel_decoder = pixel_decoder 82 | self.predictor = transformer_predictor 83 | self.transformer_in_feature = transformer_in_feature 84 | 85 | self.num_classes = num_classes 86 | 87 | @classmethod 88 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): 89 | # figure out in_channels to transformer predictor 90 | if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder": 91 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 92 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding": 93 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 94 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder": # for maskformer2 95 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 96 | else: 97 | transformer_predictor_in_channels = input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels 98 | 99 | return { 100 | "input_shape": { 101 | k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES 102 | }, 103 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 104 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 105 | "pixel_decoder": build_pixel_decoder(cfg, input_shape), 106 | "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, 107 | "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE, 108 | "transformer_predictor": build_transformer_decoder( 109 | cfg, 110 | transformer_predictor_in_channels, 111 | mask_classification=True, 112 | ), 113 | } 114 | 115 | def forward(self, features, features_clip, mask=None): 116 | return self.layers(features, features_clip, mask) 117 | 118 | def layers(self, features, features_clip, mask=None): 119 | mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(features) 120 | if self.transformer_in_feature == "multi_scale_pixel_decoder": 121 | predictions = self.predictor(multi_scale_features, mask_features, features_clip, mask) 122 | else: 123 | if self.transformer_in_feature == "transformer_encoder": 124 | assert ( 125 | transformer_encoder_features is not None 126 | ), "Please use the TransformerEncoderPixelDecoder." 127 | predictions = self.predictor(transformer_encoder_features, mask_features, mask) 128 | elif self.transformer_in_feature == "pixel_embedding": 129 | predictions = self.predictor(mask_features, mask_features, mask) 130 | else: 131 | predictions = self.predictor(features[self.transformer_in_feature], mask_features, mask) 132 | return predictions 133 | -------------------------------------------------------------------------------- /ovformer/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py 3 | import copy 4 | import logging 5 | 6 | import numpy as np 7 | import torch 8 | 9 | from detectron2.config import configurable 10 | from detectron2.data import detection_utils as utils 11 | from detectron2.data import transforms as T 12 | from detectron2.data.transforms import TransformGen 13 | from detectron2.structures import BitMasks, Boxes, Instances 14 | 15 | __all__ = ["COCOPanopticNewBaselineDatasetMapper"] 16 | 17 | 18 | def build_transform_gen(cfg, is_train): 19 | """ 20 | Create a list of default :class:`Augmentation` from config. 21 | Now it includes resizing and flipping. 22 | Returns: 23 | list[Augmentation] 24 | """ 25 | assert is_train, "Only support training augmentation" 26 | image_size = cfg.INPUT.IMAGE_SIZE 27 | min_scale = cfg.INPUT.MIN_SCALE 28 | max_scale = cfg.INPUT.MAX_SCALE 29 | 30 | augmentation = [] 31 | 32 | if cfg.INPUT.RANDOM_FLIP != "none": 33 | augmentation.append( 34 | T.RandomFlip( 35 | horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal", 36 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical", 37 | ) 38 | ) 39 | 40 | augmentation.extend([ 41 | T.ResizeScale( 42 | min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size 43 | ), 44 | T.FixedSizeCrop(crop_size=(image_size, image_size)), 45 | ]) 46 | 47 | return augmentation 48 | 49 | 50 | # This is specifically designed for the COCO dataset. 51 | class COCOPanopticNewBaselineDatasetMapper: 52 | """ 53 | A callable which takes a dataset dict in Detectron2 Dataset format, 54 | and map it into a format used by MaskFormer. 55 | 56 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. 57 | 58 | The callable currently does the following: 59 | 60 | 1. Read the image from "file_name" 61 | 2. Applies geometric transforms to the image and annotation 62 | 3. Find and applies suitable cropping to the image and annotation 63 | 4. Prepare image and annotation to Tensors 64 | """ 65 | 66 | @configurable 67 | def __init__( 68 | self, 69 | is_train=True, 70 | *, 71 | tfm_gens, 72 | image_format, 73 | ): 74 | """ 75 | NOTE: this interface is experimental. 76 | Args: 77 | is_train: for training or inference 78 | augmentations: a list of augmentations or deterministic transforms to apply 79 | crop_gen: crop augmentation 80 | tfm_gens: data augmentation 81 | image_format: an image format supported by :func:`detection_utils.read_image`. 82 | """ 83 | self.tfm_gens = tfm_gens 84 | logging.getLogger(__name__).info( 85 | "[COCOPanopticNewBaselineDatasetMapper] Full TransformGens used in training: {}".format( 86 | str(self.tfm_gens) 87 | ) 88 | ) 89 | 90 | self.img_format = image_format 91 | self.is_train = is_train 92 | 93 | @classmethod 94 | def from_config(cls, cfg, is_train=True): 95 | # Build augmentation 96 | tfm_gens = build_transform_gen(cfg, is_train) 97 | 98 | ret = { 99 | "is_train": is_train, 100 | "tfm_gens": tfm_gens, 101 | "image_format": cfg.INPUT.FORMAT, 102 | } 103 | return ret 104 | 105 | def __call__(self, dataset_dict): 106 | """ 107 | Args: 108 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 
109 | 110 | Returns: 111 | dict: a format that builtin models in detectron2 accept 112 | """ 113 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 114 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 115 | utils.check_image_size(dataset_dict, image) 116 | 117 | image, transforms = T.apply_transform_gens(self.tfm_gens, image) 118 | image_shape = image.shape[:2] # h, w 119 | 120 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 121 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 122 | # Therefore it's important to use torch.Tensor. 123 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 124 | 125 | if not self.is_train: 126 | # USER: Modify this if you want to keep them for some reason. 127 | dataset_dict.pop("annotations", None) 128 | return dataset_dict 129 | 130 | if "pan_seg_file_name" in dataset_dict: 131 | pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") 132 | segments_info = dataset_dict["segments_info"] 133 | 134 | # apply the same transformation to panoptic segmentation 135 | pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) 136 | 137 | from panopticapi.utils import rgb2id 138 | 139 | pan_seg_gt = rgb2id(pan_seg_gt) 140 | 141 | instances = Instances(image_shape) 142 | classes = [] 143 | masks = [] 144 | for segment_info in segments_info: 145 | class_id = segment_info["category_id"] 146 | if not segment_info["iscrowd"]: 147 | classes.append(class_id) 148 | masks.append(pan_seg_gt == segment_info["id"]) 149 | 150 | classes = np.array(classes) 151 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64) 152 | if len(masks) == 0: 153 | # Some image does not have annotation (all ignored) 154 | instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) 155 | instances.gt_boxes = Boxes(torch.zeros((0, 4))) 156 | else: 157 | masks = BitMasks( 158 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) 159 | ) 160 | instances.gt_masks = masks.tensor 161 | instances.gt_boxes = masks.get_bounding_boxes() 162 | 163 | dataset_dict["instances"] = instances 164 | 165 | return dataset_dict 166 | -------------------------------------------------------------------------------- /tools/analyze_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detectron2/blob/main/tools/analyze_model.py 4 | 5 | import logging 6 | import numpy as np 7 | from collections import Counter 8 | import tqdm 9 | from fvcore.nn import flop_count_table # can also try flop_count_str 10 | 11 | from detectron2.checkpoint import DetectionCheckpointer 12 | from detectron2.config import CfgNode, LazyConfig, get_cfg, instantiate 13 | from detectron2.data import build_detection_test_loader 14 | from detectron2.engine import default_argument_parser 15 | from detectron2.modeling import build_model 16 | from detectron2.projects.deeplab import add_deeplab_config 17 | from detectron2.utils.analysis import ( 18 | FlopCountAnalysis, 19 | activation_count_operators, 20 | parameter_count_table, 21 | ) 22 | from detectron2.utils.logger import setup_logger 23 | 24 | # fmt: off 25 | import os 26 | import sys 27 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 28 | # fmt: on 29 | 30 | from ovformer import add_ovformer_config 31 | 32 | logger = logging.getLogger("detectron2") 33 | 34 | 35 | def setup(args): 36 | if args.config_file.endswith(".yaml"): 37 | cfg = get_cfg() 38 | add_deeplab_config(cfg) 39 | add_ovformer_config(cfg) 40 | cfg.merge_from_file(args.config_file) 41 | cfg.DATALOADER.NUM_WORKERS = 0 42 | cfg.merge_from_list(args.opts) 43 | cfg.freeze() 44 | else: 45 | cfg = LazyConfig.load(args.config_file) 46 | cfg = LazyConfig.apply_overrides(cfg, args.opts) 47 | setup_logger(name="fvcore") 48 | setup_logger() 49 | return cfg 50 | 51 | 52 | def do_flop(cfg): 53 | if isinstance(cfg, CfgNode): 54 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) 55 | model = build_model(cfg) 56 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 57 | else: 58 | data_loader = instantiate(cfg.dataloader.test) 59 | model = instantiate(cfg.model) 60 | model.to(cfg.train.device) 61 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint) 62 | model.eval() 63 | 64 | counts = Counter() 65 | total_flops = [] 66 | for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa 67 | if args.use_fixed_input_size and isinstance(cfg, CfgNode): 68 | import torch 69 | crop_size = cfg.INPUT.CROP.SIZE[0] 70 | data[0]["image"] = torch.zeros((3, crop_size, crop_size)) 71 | flops = FlopCountAnalysis(model, data) 72 | if idx > 0: 73 | flops.unsupported_ops_warnings(False).uncalled_modules_warnings(False) 74 | counts += flops.by_operator() 75 | total_flops.append(flops.total()) 76 | 77 | logger.info("Flops table computed from only one input sample:\n" + flop_count_table(flops)) 78 | logger.info( 79 | "Average GFlops for each type of operators:\n" 80 | + str([(k, v / (idx + 1) / 1e9) for k, v in counts.items()]) 81 | ) 82 | logger.info( 83 | "Total GFlops: {:.1f}±{:.1f}".format(np.mean(total_flops) / 1e9, np.std(total_flops) / 1e9) 84 | ) 85 | 86 | 87 | def do_activation(cfg): 88 | if isinstance(cfg, CfgNode): 89 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) 90 | model = build_model(cfg) 91 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 92 | else: 93 | data_loader = instantiate(cfg.dataloader.test) 94 | model = instantiate(cfg.model) 95 | model.to(cfg.train.device) 96 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint) 97 | model.eval() 98 | 99 | counts = Counter() 100 | total_activations = [] 101 | for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa 102 | count = activation_count_operators(model, data) 103 | counts += count 104 | 
total_activations.append(sum(count.values()))
105 |     logger.info(
106 |         "(Million) Activations for Each Type of Operators:\n"
107 |         + str([(k, v / idx) for k, v in counts.items()])
108 |     )
109 |     logger.info(
110 |         "Total (Million) Activations: {}±{}".format(
111 |             np.mean(total_activations), np.std(total_activations)
112 |         )
113 |     )
114 | 
115 | 
116 | def do_parameter(cfg):
117 |     if isinstance(cfg, CfgNode):
118 |         model = build_model(cfg)
119 |     else:
120 |         model = instantiate(cfg.model)
121 |     logger.info("Parameter Count:\n" + parameter_count_table(model, max_depth=5))
122 | 
123 | 
124 | def do_structure(cfg):
125 |     if isinstance(cfg, CfgNode):
126 |         model = build_model(cfg)
127 |     else:
128 |         model = instantiate(cfg.model)
129 |     logger.info("Model Structure:\n" + str(model))
130 | 
131 | 
132 | if __name__ == "__main__":
133 |     parser = default_argument_parser(
134 |         epilog="""
135 | Examples:
136 | To show parameters of a model:
137 | $ ./analyze_model.py --tasks parameter \\
138 |     --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml
139 | Flops and activations are data-dependent, therefore inputs and model weights
140 | are needed to count them:
141 | $ ./analyze_model.py --num-inputs 100 --tasks flop \\
142 |     --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \\
143 |     MODEL.WEIGHTS /path/to/model.pkl
144 | For OVFormer:
145 | $ ./analyze_model.py --num-inputs 100 --tasks flop --config-file configs/lvis/ovformer_R50_bs8.yaml \\
146 |     MODEL.WEIGHTS models/ovformer_r50_lvis.pth
147 | """
148 |     )
149 |     parser.add_argument(
150 |         "--tasks",
151 |         choices=["flop", "activation", "parameter", "structure"],
152 |         required=True,
153 |         nargs="+",
154 |     )
155 |     parser.add_argument(
156 |         "-n",
157 |         "--num-inputs",
158 |         default=100,
159 |         type=int,
160 |         help="number of inputs used to compute statistics for flops/activations, "
161 |         "both are data dependent.",
162 |     )
163 |     parser.add_argument(
164 |         "--use-fixed-input-size",
165 |         action="store_true",
166 |         help="use fixed input size when calculating flops",
167 |     )
168 |     args = parser.parse_args()
169 |     assert not args.eval_only
170 |     assert args.num_gpus == 1
171 | 
172 |     cfg = setup(args)
173 | 
174 |     for task in args.tasks:
175 |         {
176 |             "flop": do_flop,
177 |             "activation": do_activation,
178 |             "parameter": do_parameter,
179 |             "structure": do_structure,
180 |         }[task](cfg)
181 | 
--------------------------------------------------------------------------------
/ovformer/data_video/augmentation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC
3 | 
4 | import numpy as np
5 | import logging
6 | import sys
7 | from fvcore.transforms.transform import (
8 |     HFlipTransform,
9 |     NoOpTransform,
10 |     VFlipTransform,
11 | )
12 | from PIL import Image
13 | 
14 | from detectron2.data import transforms as T
15 | 
16 | 
17 | class ResizeShortestEdge(T.Augmentation):
18 |     """
19 |     Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge.
20 |     If `max_size` is reached, then downscale so that the longer edge does not exceed max_size.
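    When `clip_frame_cnt` > 1, a new target size is sampled only once every
    `clip_frame_cnt` calls, so every frame of the same video clip is resized
    with a consistent scale (see `get_transform` below).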
21 | """ 22 | 23 | def __init__( 24 | self, short_edge_length, max_size=sys.maxsize, sample_style="range", interp=Image.BILINEAR, clip_frame_cnt=1 25 | ): 26 | """ 27 | Args: 28 | short_edge_length (list[int]): If ``sample_style=="range"``, 29 | a [min, max] interval from which to sample the shortest edge length. 30 | If ``sample_style=="choice"``, a list of shortest edge lengths to sample from. 31 | max_size (int): maximum allowed longest edge length. 32 | sample_style (str): either "range" or "choice". 33 | """ 34 | super().__init__() 35 | assert sample_style in ["range", "choice", "range_by_clip", "choice_by_clip"], sample_style 36 | 37 | self.is_range = ("range" in sample_style) 38 | if isinstance(short_edge_length, int): 39 | short_edge_length = (short_edge_length, short_edge_length) 40 | if self.is_range: 41 | assert len(short_edge_length) == 2, ( 42 | "short_edge_length must be two values using 'range' sample style." 43 | f" Got {short_edge_length}!" 44 | ) 45 | self._cnt = 0 46 | self._init(locals()) 47 | 48 | def get_transform(self, image): 49 | if self._cnt % self.clip_frame_cnt == 0: 50 | if self.is_range: 51 | self.size = np.random.randint(self.short_edge_length[0], self.short_edge_length[1] + 1) 52 | else: 53 | self.size = np.random.choice(self.short_edge_length) 54 | if self.size == 0: 55 | return NoOpTransform() 56 | 57 | self._cnt = 0 # avoiding overflow 58 | self._cnt += 1 59 | 60 | h, w = image.shape[:2] 61 | 62 | scale = self.size * 1.0 / min(h, w) 63 | if h < w: 64 | newh, neww = self.size, scale * w 65 | else: 66 | newh, neww = scale * h, self.size 67 | if max(newh, neww) > self.max_size: 68 | scale = self.max_size * 1.0 / max(newh, neww) 69 | newh = newh * scale 70 | neww = neww * scale 71 | neww = int(neww + 0.5) 72 | newh = int(newh + 0.5) 73 | return T.ResizeTransform(h, w, newh, neww, self.interp) 74 | 75 | 76 | class RandomFlip(T.Augmentation): 77 | """ 78 | Flip the image horizontally or vertically with the given probability. 79 | """ 80 | 81 | def __init__(self, prob=0.5, *, horizontal=True, vertical=False, clip_frame_cnt=1): 82 | """ 83 | Args: 84 | prob (float): probability of flip. 85 | horizontal (boolean): whether to apply horizontal flipping 86 | vertical (boolean): whether to apply vertical flipping 87 | """ 88 | super().__init__() 89 | 90 | if horizontal and vertical: 91 | raise ValueError("Cannot do both horiz and vert. 
Please use two Flip instead.") 92 | if not horizontal and not vertical: 93 | raise ValueError("At least one of horiz or vert has to be True!") 94 | self._cnt = 0 95 | 96 | self._init(locals()) 97 | 98 | def get_transform(self, image): 99 | if self._cnt % self.clip_frame_cnt == 0: 100 | self.do = self._rand_range() < self.prob 101 | self._cnt = 0 # avoiding overflow 102 | self._cnt += 1 103 | 104 | h, w = image.shape[:2] 105 | 106 | if self.do: 107 | if self.horizontal: 108 | return HFlipTransform(w) 109 | elif self.vertical: 110 | return VFlipTransform(h) 111 | else: 112 | return NoOpTransform() 113 | 114 | 115 | def build_augmentation(cfg, is_train): 116 | logger = logging.getLogger(__name__) 117 | aug_list = [] 118 | if is_train: 119 | # Crop 120 | if cfg.INPUT.CROP.ENABLED: 121 | aug_list.append(T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE)) 122 | 123 | # Resize 124 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 125 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 126 | sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING 127 | ms_clip_frame_cnt = cfg.INPUT.SAMPLING_FRAME_NUM if "by_clip" in cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING else 1 128 | aug_list.append(ResizeShortestEdge(min_size, max_size, sample_style, clip_frame_cnt=ms_clip_frame_cnt)) 129 | 130 | # Flip 131 | if cfg.INPUT.RANDOM_FLIP != "none": 132 | if cfg.INPUT.RANDOM_FLIP == "flip_by_clip": 133 | flip_clip_frame_cnt = cfg.INPUT.SAMPLING_FRAME_NUM 134 | else: 135 | flip_clip_frame_cnt = 1 136 | 137 | aug_list.append( 138 | # NOTE using RandomFlip modified for the support of flip maintenance 139 | RandomFlip( 140 | horizontal=(cfg.INPUT.RANDOM_FLIP == "horizontal") or (cfg.INPUT.RANDOM_FLIP == "flip_by_clip"), 141 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical", 142 | clip_frame_cnt=flip_clip_frame_cnt, 143 | ) 144 | ) 145 | 146 | # Additional augmentations : brightness, contrast, saturation, rotation 147 | augmentations = cfg.INPUT.AUGMENTATIONS 148 | if "brightness" in augmentations: 149 | aug_list.append(T.RandomBrightness(0.9, 1.1)) 150 | if "contrast" in augmentations: 151 | aug_list.append(T.RandomContrast(0.9, 1.1)) 152 | if "saturation" in augmentations: 153 | aug_list.append(T.RandomSaturation(0.9, 1.1)) 154 | if "rotation" in augmentations: 155 | aug_list.append( 156 | T.RandomRotation( 157 | [-15, 15], expand=False, center=[(0.4, 0.4), (0.6, 0.6)], sample_style="range" 158 | ) 159 | ) 160 | else: 161 | # Resize 162 | min_size = cfg.INPUT.MIN_SIZE_TEST 163 | max_size = cfg.INPUT.MAX_SIZE_TEST 164 | sample_style = "choice" 165 | aug_list.append(T.ResizeShortestEdge(min_size, max_size, sample_style)) 166 | 167 | return aug_list 168 | -------------------------------------------------------------------------------- /ovformer_video/modeling/transformer_decoder/zero_shot_classifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
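# NOTE: ZeroShotClassifier maps decoder queries into the CLIP embedding space
# with a two-layer MLP, refines them by cross-attending over CLIP image
# features, and scores them against L2-normalized category text embeddings
# (loaded from `zs_weight_path`, or randomly initialized when it is 'rand')
# via a temperature-scaled dot product.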
2 | import numpy as np 3 | import torch 4 | from torch import nn, Tensor 5 | from torch.nn import functional as F 6 | from detectron2.config import configurable 7 | from detectron2.layers import Linear, ShapeSpec 8 | from typing import Optional 9 | 10 | 11 | class CrossAttentionLayer(nn.Module): 12 | def __init__(self, d_model, nhead, dropout=0.0, 13 | activation="relu", normalize_before=False): 14 | super().__init__() 15 | self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 16 | 17 | self.norm = nn.LayerNorm(d_model) 18 | self.dropout = nn.Dropout(dropout) 19 | 20 | self.activation = _get_activation_fn(activation) 21 | self.normalize_before = normalize_before 22 | 23 | self._reset_parameters() 24 | 25 | def _reset_parameters(self): 26 | for p in self.parameters(): 27 | if p.dim() > 1: 28 | nn.init.xavier_uniform_(p) 29 | 30 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 31 | return tensor if pos is None else tensor + pos 32 | 33 | def forward_post(self, tgt, memory, 34 | memory_mask: Optional[Tensor] = None, 35 | memory_key_padding_mask: Optional[Tensor] = None, 36 | pos: Optional[Tensor] = None, 37 | query_pos: Optional[Tensor] = None): 38 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), 39 | key=self.with_pos_embed(memory, pos), 40 | value=memory, attn_mask=memory_mask, 41 | key_padding_mask=memory_key_padding_mask)[0] 42 | tgt = tgt + self.dropout(tgt2) 43 | tgt = self.norm(tgt) 44 | 45 | return tgt 46 | 47 | def forward_pre(self, tgt, memory, 48 | memory_mask: Optional[Tensor] = None, 49 | memory_key_padding_mask: Optional[Tensor] = None, 50 | pos: Optional[Tensor] = None, 51 | query_pos: Optional[Tensor] = None): 52 | tgt2 = self.norm(tgt) 53 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos), 54 | key=self.with_pos_embed(memory, pos), 55 | value=memory, attn_mask=memory_mask, 56 | key_padding_mask=memory_key_padding_mask)[0] 57 | tgt = tgt + self.dropout(tgt2) 58 | 59 | return tgt 60 | 61 | def forward(self, tgt, memory, 62 | memory_mask: Optional[Tensor] = None, 63 | memory_key_padding_mask: Optional[Tensor] = None, 64 | pos: Optional[Tensor] = None, 65 | query_pos: Optional[Tensor] = None): 66 | if self.normalize_before: 67 | return self.forward_pre(tgt, memory, memory_mask, 68 | memory_key_padding_mask, pos, query_pos) 69 | return self.forward_post(tgt, memory, memory_mask, 70 | memory_key_padding_mask, pos, query_pos) 71 | 72 | 73 | def _get_activation_fn(activation): 74 | """Return an activation function given a string""" 75 | if activation == "relu": 76 | return F.relu 77 | if activation == "gelu": 78 | return F.gelu 79 | if activation == "glu": 80 | return F.glu 81 | raise RuntimeError(F"activation should be relu/gelu, not {activation}.") 82 | 83 | 84 | class ZeroShotClassifier(nn.Module): 85 | def __init__( 86 | self, 87 | input_shape: ShapeSpec, 88 | num_classes: int, 89 | zs_weight_path: str, 90 | zs_weight_dim: int = 512, 91 | use_bias: float = 0.0, 92 | norm_weight: bool = True, 93 | norm_temperature: float = 50.0, 94 | ): 95 | super().__init__() 96 | if isinstance(input_shape, int): # some backward compatibility 97 | input_shape = ShapeSpec(channels=input_shape) 98 | input_size = input_shape.channels * (input_shape.width or 1) * (input_shape.height or 1) 99 | self.norm_weight = norm_weight 100 | self.norm_temperature = norm_temperature 101 | 102 | self.use_bias = use_bias < 0 103 | if self.use_bias: 104 | self.cls_bias = nn.Parameter(torch.ones(1) * use_bias) 105 | 106 | self.linear = 
nn.Sequential(nn.Linear(input_size, zs_weight_dim//2), 107 | nn.ReLU(), 108 | nn.Linear(zs_weight_dim//2, zs_weight_dim)) 109 | 110 | self.cross_attention = CrossAttentionLayer( 111 | d_model=zs_weight_dim, 112 | nhead=8, 113 | dropout=0.0, 114 | normalize_before=False, 115 | ) 116 | 117 | 118 | if zs_weight_path == 'rand': 119 | zs_weight = torch.randn((zs_weight_dim, num_classes)) 120 | nn.init.normal_(zs_weight, std=0.01) 121 | else: 122 | zs_weight = torch.tensor( 123 | np.load(zs_weight_path), 124 | dtype=torch.float32).permute(1, 0).contiguous() # D x C 125 | zs_weight = torch.cat( 126 | [zs_weight, zs_weight.new_zeros((zs_weight_dim, 1))], 127 | dim=1) # D x (C + 1) 128 | 129 | if self.norm_weight: 130 | zs_weight = F.normalize(zs_weight, p=2, dim=0) 131 | 132 | if zs_weight_path == 'rand': 133 | self.zs_weight = nn.Parameter(zs_weight) 134 | else: 135 | self.register_buffer('zs_weight', zs_weight) 136 | assert self.zs_weight.shape[1] == num_classes + 1 137 | 138 | 139 | def forward(self, x, features_clip, classifier=None): 140 | x = self.linear(x).transpose(0, 1) # (b,100,512) 141 | x = self.cross_attention( 142 | x, features_clip, 143 | memory_mask=None, 144 | memory_key_padding_mask=None, 145 | pos=None, query_pos=None 146 | ) 147 | x = x.transpose(0, 1) 148 | 149 | if classifier is not None: 150 | zs_weight = classifier.permute(1, 0).contiguous() # D x C' 151 | zs_weight = F.normalize(zs_weight, p=2, dim=0) \ 152 | if self.norm_weight else zs_weight 153 | else: 154 | zs_weight = self.zs_weight # (512, 1197) 155 | if self.norm_weight: 156 | x = self.norm_temperature * F.normalize(x, p=2, dim=2) 157 | bs, qn, _ = x.shape 158 | x = x.reshape(bs * qn, -1) # (b*100, 512) 159 | x = torch.mm(x, zs_weight) # (b*100, 1197) 160 | x = x.reshape(bs, qn, -1) # (b,100,1197) 161 | if self.use_bias: 162 | x = x + self.cls_bias 163 | return x 164 | -------------------------------------------------------------------------------- /ovformer/modeling/transformer_decoder/zero_shot_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch import nn, Tensor 4 | from torch.nn import functional as F 5 | from detectron2.config import configurable 6 | from detectron2.layers import Linear, ShapeSpec 7 | from typing import Optional 8 | 9 | 10 | class CrossAttentionLayer(nn.Module): 11 | def __init__(self, d_model, nhead, dropout=0.0, 12 | activation="relu", normalize_before=False): 13 | super().__init__() 14 | self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 15 | 16 | self.norm = nn.LayerNorm(d_model) 17 | self.dropout = nn.Dropout(dropout) 18 | 19 | self.activation = _get_activation_fn(activation) 20 | self.normalize_before = normalize_before 21 | 22 | self._reset_parameters() 23 | 24 | def _reset_parameters(self): 25 | for p in self.parameters(): 26 | if p.dim() > 1: 27 | nn.init.xavier_uniform_(p) 28 | 29 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 30 | return tensor if pos is None else tensor + pos 31 | 32 | def forward_post(self, tgt, memory, 33 | memory_mask: Optional[Tensor] = None, 34 | memory_key_padding_mask: Optional[Tensor] = None, 35 | pos: Optional[Tensor] = None, 36 | query_pos: Optional[Tensor] = None): 37 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), 38 | key=self.with_pos_embed(memory, pos), 39 | value=memory, attn_mask=memory_mask, 40 | key_padding_mask=memory_key_padding_mask)[0] 41 | tgt = tgt + self.dropout(tgt2) 42 | tgt = 
self.norm(tgt) 43 | 44 | return tgt 45 | 46 | def forward_pre(self, tgt, memory, 47 | memory_mask: Optional[Tensor] = None, 48 | memory_key_padding_mask: Optional[Tensor] = None, 49 | pos: Optional[Tensor] = None, 50 | query_pos: Optional[Tensor] = None): 51 | tgt2 = self.norm(tgt) 52 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos), 53 | key=self.with_pos_embed(memory, pos), 54 | value=memory, attn_mask=memory_mask, 55 | key_padding_mask=memory_key_padding_mask)[0] 56 | tgt = tgt + self.dropout(tgt2) 57 | 58 | return tgt 59 | 60 | def forward(self, tgt, memory, 61 | memory_mask: Optional[Tensor] = None, 62 | memory_key_padding_mask: Optional[Tensor] = None, 63 | pos: Optional[Tensor] = None, 64 | query_pos: Optional[Tensor] = None): 65 | if self.normalize_before: 66 | return self.forward_pre(tgt, memory, memory_mask, 67 | memory_key_padding_mask, pos, query_pos) 68 | return self.forward_post(tgt, memory, memory_mask, 69 | memory_key_padding_mask, pos, query_pos) 70 | 71 | 72 | def _get_activation_fn(activation): 73 | """Return an activation function given a string""" 74 | if activation == "relu": 75 | return F.relu 76 | if activation == "gelu": 77 | return F.gelu 78 | if activation == "glu": 79 | return F.glu 80 | raise RuntimeError(F"activation should be relu/gelu, not {activation}.") 81 | 82 | 83 | class ZeroShotClassifier(nn.Module): 84 | def __init__( 85 | self, 86 | input_shape: ShapeSpec, 87 | num_classes: int, 88 | zs_weight_path: str, 89 | zs_weight_dim: int = 512, 90 | use_bias: float = 0.0, 91 | norm_weight: bool = True, 92 | norm_temperature: float = 50.0, 93 | ): 94 | super().__init__() 95 | if isinstance(input_shape, int): # some backward compatibility 96 | input_shape = ShapeSpec(channels=input_shape) 97 | input_size = input_shape.channels * (input_shape.width or 1) * (input_shape.height or 1) 98 | self.norm_weight = norm_weight 99 | self.norm_temperature = norm_temperature 100 | 101 | self.use_bias = use_bias < 0 102 | if self.use_bias: 103 | self.cls_bias = nn.Parameter(torch.ones(1) * use_bias) 104 | 105 | self.linear = nn.Sequential(nn.Linear(input_size, zs_weight_dim // 2), 106 | nn.ReLU(), 107 | nn.Linear(zs_weight_dim // 2, zs_weight_dim)) 108 | 109 | self.cross_attention = CrossAttentionLayer( 110 | d_model=zs_weight_dim, 111 | nhead=8, 112 | dropout=0.0, 113 | normalize_before=False, 114 | ) 115 | 116 | if zs_weight_path == 'rand': 117 | zs_weight = torch.randn((zs_weight_dim, num_classes)) 118 | nn.init.normal_(zs_weight, std=0.01) 119 | else: 120 | zs_weight = torch.tensor( 121 | np.load(zs_weight_path), 122 | dtype=torch.float32).permute(1, 0).contiguous() # D x C 123 | zs_weight = torch.cat( 124 | [zs_weight, zs_weight.new_zeros((zs_weight_dim, 1))], 125 | dim=1) # D x (C + 1) 126 | 127 | if self.norm_weight: 128 | zs_weight = F.normalize(zs_weight, p=2, dim=0) 129 | 130 | if zs_weight_path == 'rand': 131 | self.zs_weight = nn.Parameter(zs_weight) 132 | else: 133 | self.register_buffer('zs_weight', zs_weight) 134 | assert self.zs_weight.shape[1] == num_classes + 1 135 | 136 | def forward(self, x, features_clip, classifier=None): 137 | x = self.linear(x).transpose(0, 1) # (b,100,512) 138 | features_clip = torch.stack(features_clip, dim=1) 139 | x = self.cross_attention( 140 | x, features_clip, 141 | memory_mask=None, 142 | memory_key_padding_mask=None, 143 | pos=None, query_pos=None 144 | ) 145 | x = x.transpose(0, 1) 146 | 147 | if classifier is not None: 148 | zs_weight = classifier.permute(1, 0).contiguous() # D x C' 149 | zs_weight = 
F.normalize(zs_weight, p=2, dim=0) \ 150 | if self.norm_weight else zs_weight 151 | else: 152 | zs_weight = self.zs_weight # (512, k) 153 | if self.norm_weight: 154 | x = self.norm_temperature * F.normalize(x, p=2, dim=2) 155 | bs, qn, _ = x.shape 156 | x = x.reshape(bs * qn, -1) # (b*100, 512) 157 | x = torch.mm(x, zs_weight) # (b*100, k) 158 | x = x.reshape(bs, qn, -1) # (b,100,k) 159 | if self.use_bias: 160 | x = x + self.cls_bias 161 | return x 162 | -------------------------------------------------------------------------------- /ovformer/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | import logging 4 | 5 | import numpy as np 6 | import torch 7 | from torch.nn import functional as F 8 | 9 | from detectron2.config import configurable 10 | from detectron2.data import detection_utils as utils 11 | from detectron2.data import transforms as T 12 | from detectron2.structures import BitMasks, Instances 13 | 14 | from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper 15 | 16 | __all__ = ["MaskFormerPanopticDatasetMapper"] 17 | 18 | 19 | class MaskFormerPanopticDatasetMapper(MaskFormerSemanticDatasetMapper): 20 | """ 21 | A callable which takes a dataset dict in Detectron2 Dataset format, 22 | and map it into a format used by MaskFormer for panoptic segmentation. 23 | 24 | The callable currently does the following: 25 | 26 | 1. Read the image from "file_name" 27 | 2. Applies geometric transforms to the image and annotation 28 | 3. Find and applies suitable cropping to the image and annotation 29 | 4. Prepare image and annotation to Tensors 30 | """ 31 | 32 | @configurable 33 | def __init__( 34 | self, 35 | is_train=True, 36 | *, 37 | augmentations, 38 | image_format, 39 | ignore_label, 40 | size_divisibility, 41 | ): 42 | """ 43 | NOTE: this interface is experimental. 44 | Args: 45 | is_train: for training or inference 46 | augmentations: a list of augmentations or deterministic transforms to apply 47 | image_format: an image format supported by :func:`detection_utils.read_image`. 48 | ignore_label: the label that is ignored to evaluation 49 | size_divisibility: pad image size to be divisible by this value 50 | """ 51 | super().__init__( 52 | is_train, 53 | augmentations=augmentations, 54 | image_format=image_format, 55 | ignore_label=ignore_label, 56 | size_divisibility=size_divisibility, 57 | ) 58 | 59 | def __call__(self, dataset_dict): 60 | """ 61 | Args: 62 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 63 | 64 | Returns: 65 | dict: a format that builtin models in detectron2 accept 66 | """ 67 | assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!" 
68 | 69 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 70 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 71 | utils.check_image_size(dataset_dict, image) 72 | 73 | # semantic segmentation 74 | if "sem_seg_file_name" in dataset_dict: 75 | # PyTorch transformation not implemented for uint16, so converting it to double first 76 | sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") 77 | else: 78 | sem_seg_gt = None 79 | 80 | # panoptic segmentation 81 | if "pan_seg_file_name" in dataset_dict: 82 | pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") 83 | segments_info = dataset_dict["segments_info"] 84 | else: 85 | pan_seg_gt = None 86 | segments_info = None 87 | 88 | if pan_seg_gt is None: 89 | raise ValueError( 90 | "Cannot find 'pan_seg_file_name' for panoptic segmentation dataset {}.".format( 91 | dataset_dict["file_name"] 92 | ) 93 | ) 94 | 95 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt) 96 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) 97 | image = aug_input.image 98 | if sem_seg_gt is not None: 99 | sem_seg_gt = aug_input.sem_seg 100 | 101 | # apply the same transformation to panoptic segmentation 102 | pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) 103 | 104 | from panopticapi.utils import rgb2id 105 | 106 | pan_seg_gt = rgb2id(pan_seg_gt) 107 | 108 | # Pad image and segmentation label here! 109 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 110 | if sem_seg_gt is not None: 111 | sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) 112 | pan_seg_gt = torch.as_tensor(pan_seg_gt.astype("long")) 113 | 114 | if self.size_divisibility > 0: 115 | image_size = (image.shape[-2], image.shape[-1]) 116 | padding_size = [ 117 | 0, 118 | self.size_divisibility - image_size[1], 119 | 0, 120 | self.size_divisibility - image_size[0], 121 | ] 122 | image = F.pad(image, padding_size, value=128).contiguous() 123 | if sem_seg_gt is not None: 124 | sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous() 125 | pan_seg_gt = F.pad( 126 | pan_seg_gt, padding_size, value=0 127 | ).contiguous() # 0 is the VOID panoptic label 128 | 129 | image_shape = (image.shape[-2], image.shape[-1]) # h, w 130 | 131 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 132 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 133 | # Therefore it's important to use torch.Tensor. 
134 | dataset_dict["image"] = image 135 | if sem_seg_gt is not None: 136 | dataset_dict["sem_seg"] = sem_seg_gt.long() 137 | 138 | if "annotations" in dataset_dict: 139 | raise ValueError("Pemantic segmentation dataset should not have 'annotations'.") 140 | 141 | # Prepare per-category binary masks 142 | pan_seg_gt = pan_seg_gt.numpy() 143 | instances = Instances(image_shape) 144 | classes = [] 145 | masks = [] 146 | for segment_info in segments_info: 147 | class_id = segment_info["category_id"] 148 | if not segment_info["iscrowd"]: 149 | classes.append(class_id) 150 | masks.append(pan_seg_gt == segment_info["id"]) 151 | 152 | classes = np.array(classes) 153 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64) 154 | if len(masks) == 0: 155 | # Some image does not have annotation (all ignored) 156 | instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) 157 | else: 158 | masks = BitMasks( 159 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) 160 | ) 161 | instances.gt_masks = masks.tensor 162 | 163 | dataset_dict["instances"] = instances 164 | 165 | return dataset_dict 166 | -------------------------------------------------------------------------------- /ovformer/data/datasets/lvis_v1.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import logging 3 | import os 4 | 5 | from fvcore.common.timer import Timer 6 | from detectron2.structures import BoxMode 7 | from fvcore.common.file_io import PathManager 8 | from detectron2.data import DatasetCatalog, MetadataCatalog 9 | from detectron2.data.datasets.lvis import get_lvis_instances_meta 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | __all__ = ["custom_load_lvis_json", "custom_register_lvis_instances"] 14 | 15 | 16 | def custom_register_lvis_instances(name, metadata, json_file, image_root): 17 | """ 18 | """ 19 | DatasetCatalog.register(name, lambda: custom_load_lvis_json( 20 | json_file, image_root, name)) 21 | MetadataCatalog.get(name).set( 22 | json_file=json_file, image_root=image_root, 23 | evaluator_type="lvis", **metadata 24 | ) 25 | 26 | 27 | def custom_load_lvis_json(json_file, image_root, dataset_name=None): 28 | ''' 29 | Modifications: 30 | use `file_name` 31 | convert neg_category_ids 32 | add pos_category_ids 33 | ''' 34 | from lvis import LVIS 35 | 36 | json_file = PathManager.get_local_path(json_file) 37 | 38 | timer = Timer() 39 | lvis_api = LVIS(json_file) 40 | if timer.seconds() > 1: 41 | logger.info("Loading {} takes {:.2f} seconds.".format( 42 | json_file, timer.seconds())) 43 | 44 | catid2contid = {x['id']: i for i, x in enumerate( 45 | sorted(lvis_api.dataset['categories'], key=lambda x: x['id']))} 46 | if len(lvis_api.dataset['categories']) == 1203: 47 | for x in lvis_api.dataset['categories']: 48 | assert catid2contid[x['id']] == x['id'] - 1 49 | img_ids = sorted(lvis_api.imgs.keys()) 50 | imgs = lvis_api.load_imgs(img_ids) 51 | anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids] 52 | 53 | ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image] 54 | assert len(set(ann_ids)) == len(ann_ids), \ 55 | "Annotation ids in '{}' are not unique".format(json_file) 56 | 57 | imgs_anns = list(zip(imgs, anns)) 58 | logger.info("Loaded {} images in the LVIS v1 format from {}".format( 59 | len(imgs_anns), json_file)) 60 | 61 | dataset_dicts = [] 62 | 63 | for (img_dict, anno_dict_list) in imgs_anns: 64 | record = {} 65 | if "file_name" 
in img_dict: 66 | file_name = img_dict["file_name"] 67 | if img_dict["file_name"].startswith("COCO"): 68 | file_name = file_name[-16:] 69 | record["file_name"] = os.path.join(image_root, file_name) 70 | elif 'coco_url' in img_dict: 71 | # e.g., http://images.cocodataset.org/train2017/000000391895.jpg 72 | file_name = img_dict["coco_url"][30:] 73 | record["file_name"] = os.path.join(image_root, file_name) 74 | elif 'tar_index' in img_dict: 75 | record['tar_index'] = img_dict['tar_index'] 76 | 77 | record["height"] = img_dict["height"] 78 | record["width"] = img_dict["width"] 79 | record["not_exhaustive_category_ids"] = img_dict.get( 80 | "not_exhaustive_category_ids", []) 81 | record["neg_category_ids"] = img_dict.get("neg_category_ids", []) 82 | # NOTE: modified by Xingyi: convert to 0-based 83 | record["neg_category_ids"] = [ 84 | catid2contid[x] for x in record["neg_category_ids"]] 85 | if 'pos_category_ids' in img_dict: 86 | record['pos_category_ids'] = [ 87 | catid2contid[x] for x in img_dict.get("pos_category_ids", [])] 88 | if 'captions' in img_dict: 89 | record['captions'] = img_dict['captions'] 90 | if 'caption_features' in img_dict: 91 | record['caption_features'] = img_dict['caption_features'] 92 | image_id = record["image_id"] = img_dict["id"] 93 | 94 | objs = [] 95 | for anno in anno_dict_list: 96 | assert anno["image_id"] == image_id 97 | if anno.get('iscrowd', 0) > 0: 98 | continue 99 | obj = {"bbox": anno["bbox"], "bbox_mode": BoxMode.XYWH_ABS} 100 | obj["category_id"] = catid2contid[anno['category_id']] 101 | if 'segmentation' in anno: 102 | segm = anno["segmentation"] 103 | valid_segm = [poly for poly in segm \ 104 | if len(poly) % 2 == 0 and len(poly) >= 6] 105 | # assert len(segm) == len( 106 | # valid_segm 107 | # ), "Annotation contains an invalid polygon with < 3 points" 108 | if not len(segm) == len(valid_segm): 109 | print('Annotation contains an invalid polygon with < 3 points') 110 | assert len(segm) > 0 111 | obj["segmentation"] = segm 112 | objs.append(obj) 113 | record["annotations"] = objs 114 | dataset_dicts.append(record) 115 | 116 | return dataset_dicts 117 | 118 | _CUSTOM_SPLITS_LVIS = { 119 | "lvis_v1_train+coco": ("coco/", "lvis/lvis_v1_train+coco_mask.json"), 120 | "lvis_v1_train_norare": ("coco/", "lvis/lvis_v1_train_norare.json"), 121 | "lvis_v1_train_norare_cloth": ("coco/", "lvis/lvis_v1_train_norare_cloth.json"), 122 | "lvis_v1_train_norare_coco": ("coco/", "lvis/lvis_v1_train_norare_coco.json"), 123 | "lvis_v1_train_norare_nocloth": ("coco/", "lvis/lvis_v1_train_norare_nocloth.json"), 124 | "lvis_v1_val_cloth": ("coco/", "lvis/lvis_v1_val_cloth.json"), 125 | "lvis_v1_val_person": ("coco/", "lvis/lvis_v1_val_person.json"), 126 | "lvis_v1_val_car": ("coco/", "lvis/lvis_v1_val_car.json"), 127 | "lvis_v1_val_coco": ("coco/", "lvis/lvis_v1_val_coco.json") 128 | } 129 | 130 | 131 | for key, (image_root, json_file) in _CUSTOM_SPLITS_LVIS.items(): 132 | custom_register_lvis_instances( 133 | key, 134 | get_lvis_instances_meta(key), 135 | os.path.join("datasets", json_file) if "://" not in json_file else json_file, 136 | os.path.join("datasets", image_root), 137 | ) 138 | 139 | 140 | def get_lvis_22k_meta(): 141 | from .lvis_22k_categories import CATEGORIES 142 | cat_ids = [k["id"] for k in CATEGORIES] 143 | assert min(cat_ids) == 1 and max(cat_ids) == len( 144 | cat_ids 145 | ), "Category ids are not in [1, #categories], as expected" 146 | # Ensure that the category list is sorted by id 147 | lvis_categories = sorted(CATEGORIES, key=lambda x: x["id"]) 148 
| thing_classes = [k["name"] for k in lvis_categories] 149 | meta = {"thing_classes": thing_classes} 150 | return meta 151 | 152 | _CUSTOM_SPLITS_LVIS_22K = { 153 | "lvis_v1_train_22k": ("coco/", "lvis/lvis_v1_train_lvis-22k.json"), 154 | } 155 | 156 | for key, (image_root, json_file) in _CUSTOM_SPLITS_LVIS_22K.items(): 157 | custom_register_lvis_instances( 158 | key, 159 | get_lvis_22k_meta(), 160 | os.path.join("datasets", json_file) if "://" not in json_file else json_file, 161 | os.path.join("datasets", image_root), 162 | ) 163 | -------------------------------------------------------------------------------- /tools/burst2ytvis.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | import os 4 | import json 5 | import pycocotools.mask as cocomask 6 | from tabulate import tabulate 7 | from typing import Union 8 | import copy 9 | 10 | def _global_track_id(*, local_track_id: Union[str, int], 11 | video_id: Union[str, int], 12 | track_id_mapping) -> int: 13 | # remap local track ids into globally unique ids 14 | return track_id_mapping[str(video_id)][str(local_track_id)] 15 | 16 | 17 | class B2YConverter: 18 | def __init__(self, b_format,class_split): 19 | self._b_format = b_format 20 | self._class_common_split=class_split 21 | self._split = b_format['split'] 22 | self._ori_categories = b_format['categories'] 23 | self._categories =self._make_categories() 24 | self._cate_map=self._make_map() 25 | self._videos = [] 26 | self._annotations = [] 27 | self._tracks = {} 28 | self._images = [] 29 | self._next_img_id = 0 30 | self._next_ann_id = 0 31 | 32 | self._track_id_mapping = self._load_track_id_mapping() 33 | 34 | for seq in b_format['sequences']: 35 | self._visit_seq(seq) 36 | 37 | def _load_track_id_mapping(self): 38 | id_map = {} 39 | next_global_track_id = 1 40 | for seq in self._b_format['sequences']: 41 | seq_id = seq['id'] 42 | seq_id_map = {} 43 | id_map[str(seq_id)] = seq_id_map 44 | for local_track_id in seq['track_category_ids']: 45 | seq_id_map[str(local_track_id)] = next_global_track_id 46 | next_global_track_id += 1 47 | return id_map 48 | 49 | def global_track_id(self, *, local_track_id: Union[str, int], 50 | video_id: Union[str, int]) -> int: 51 | return _global_track_id(local_track_id=local_track_id, 52 | video_id=video_id, 53 | track_id_mapping=self._track_id_mapping) 54 | 55 | def _visit_seq(self, seq): 56 | self._make_video(seq) 57 | imgs = self._make_images(seq) 58 | self._make_annotations_and_tracks(seq, imgs) 59 | 60 | def _make_images(self, seq): 61 | imgs = [] 62 | for img_path in seq['annotated_image_paths']: 63 | video = self._split + '/' + seq['dataset'] + '/' + seq['seq_name'] 64 | file_name = video + '/' + img_path 65 | 66 | # TODO: once python 3.9 is more common, we can use this nicer and safer code 67 | #stripped = img_path.removesuffix('.jpg').removesuffix('.png').removeprefix('frame') 68 | stripped = img_path.replace('.jpg', '').replace('.png', '').replace('frame', '') 69 | 70 | last = stripped.split('_')[-1] 71 | frame_idx = int(last) 72 | 73 | img = {'id': self._next_img_id, 'video': video, 74 | 'width': seq['width'], 'height': seq['height'], 75 | 'file_name': file_name, 76 | 'frame_index': frame_idx, 77 | 'video_id': seq['id']} 78 | self._next_img_id += 1 79 | self._images.append(img) 80 | imgs.append(img) 81 | return imgs 82 | 83 | def _make_video(self, seq): 84 | video_id = seq['id'] 85 | dataset = seq['dataset'] 86 | seq_name = seq['seq_name'] 87 | name = dataset + '/' + seq_name 88 
| file_name=[name+'/'+iname for iname in seq['annotated_image_paths']] 89 | video = { 90 | 'id': video_id, 'width': seq['width'], 'height': seq['height'],'length':len(file_name), 91 | 'neg_category_ids': seq['neg_category_ids'], 92 | 'not_exhaustive_category_ids': seq['not_exhaustive_category_ids'], 93 | 'file_names': file_name, 'metadata': {'dataset': dataset}} 94 | self._videos.append(video) 95 | 96 | def _make_annotations_and_tracks(self, seq, imgs): 97 | video_id = seq['id'] 98 | segs = seq['segmentations'] 99 | assert len(segs) == len(imgs), (len(segs), len(imgs)) 100 | for i in seq['track_category_ids'].keys(): 101 | segmentations=[] 102 | bboxs=[] 103 | for frame_segs, img in zip(segs, imgs): 104 | if i in frame_segs: 105 | rle = frame_segs[i]['rle'] 106 | segment = {'counts': rle, 'size': [img['height'], img['width']]} 107 | segmentations.append(segment) 108 | coco_bbox = cocomask.toBbox(segment) 109 | bbox = [int(x) for x in coco_bbox] 110 | bboxs.append(bbox) 111 | else : 112 | segmentations.append(None) 113 | bboxs.append(None) 114 | category_id = int(seq['track_category_ids'][i]) 115 | ann = {'segmentations': segmentations, 'id': self._next_ann_id, 116 | 'category_id': self._cate_map[category_id],'width': seq['width'], 'height': seq['height'], 117 | 'video_id': video_id, 118 | 'bboxes': bboxs} 119 | self._next_ann_id += 1 120 | self._annotations.append(ann) 121 | 122 | def convert(self): 123 | return {'videos': self._videos, 'annotations': self._annotations, 124 | 'images': self._images, 125 | 'categories': self._categories, 126 | 'cate_ori':self._ori_categories, 127 | 'track_id_mapping': self._track_id_mapping, 128 | 'split': self._split} 129 | 130 | def _make_categories(self): 131 | common_class=self._class_common_split['common'] 132 | uncommon_class=self._class_common_split['uncommon'] 133 | cate_mod=[] 134 | for idx,cate in enumerate(self._ori_categories): 135 | cate_2=copy.deepcopy(cate) 136 | if cate_2['id'] in common_class: 137 | cate_2['split']='common' 138 | if cate_2['id'] in uncommon_class: 139 | cate_2['split']='uncommon' 140 | cate_2['id']=idx+1 141 | cate_mod.append(cate_2) 142 | 143 | return cate_mod 144 | def _make_map(self): 145 | 146 | cate_map={} 147 | for idx,(ori,mod) in enumerate(zip(self._ori_categories,self._categories)): 148 | cate_map[ori['id']]=mod['id'] 149 | return cate_map 150 | 151 | 152 | if __name__ == '__main__': 153 | parser = argparse.ArgumentParser() 154 | parser.add_argument('--ann', type=str,default='datasets/burst/val/all_classes.json') 155 | parser.add_argument('--out', type=str,default='datasets/burst/val/b2y_val.json') 156 | args = parser.parse_args() 157 | class_common='datasets/burst/info/class_split.json' 158 | with open(class_common) as ft: 159 | class_common_dict = json.load(ft) 160 | with open(args.ann) as f: 161 | b_format_gt = json.load(f) 162 | y_format_gt = B2YConverter(b_format_gt,class_common_dict).convert() 163 | with open(args.out, 'w') as f: 164 | json.dump(y_format_gt, f) 165 | -------------------------------------------------------------------------------- /ovformer/modeling/pixel_decoder/ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import warnings 17 | import math 18 | 19 | import torch 20 | from torch import nn 21 | import torch.nn.functional as F 22 | from torch.nn.init import xavier_uniform_, constant_ 23 | 24 | from ..functions import MSDeformAttnFunction 25 | from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch 26 | 27 | 28 | def _is_power_of_2(n): 29 | if (not isinstance(n, int)) or (n < 0): 30 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 31 | return (n & (n-1) == 0) and n != 0 32 | 33 | 34 | class MSDeformAttn(nn.Module): 35 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): 36 | """ 37 | Multi-Scale Deformable Attention Module 38 | :param d_model hidden dimension 39 | :param n_levels number of feature levels 40 | :param n_heads number of attention heads 41 | :param n_points number of sampling points per attention head per feature level 42 | """ 43 | super().__init__() 44 | if d_model % n_heads != 0: 45 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 46 | _d_per_head = d_model // n_heads 47 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 48 | if not _is_power_of_2(_d_per_head): 49 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 50 | "which is more efficient in our CUDA implementation.") 51 | 52 | self.im2col_step = 128 53 | 54 | self.d_model = d_model 55 | self.n_levels = n_levels 56 | self.n_heads = n_heads 57 | self.n_points = n_points 58 | 59 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 60 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 61 | self.value_proj = nn.Linear(d_model, d_model) 62 | self.output_proj = nn.Linear(d_model, d_model) 63 | 64 | self._reset_parameters() 65 | 66 | def _reset_parameters(self): 67 | constant_(self.sampling_offsets.weight.data, 0.) 68 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 69 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 70 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 71 | for i in range(self.n_points): 72 | grid_init[:, :, i, :] *= i + 1 73 | with torch.no_grad(): 74 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 75 | constant_(self.attention_weights.weight.data, 0.) 76 | constant_(self.attention_weights.bias.data, 0.) 77 | xavier_uniform_(self.value_proj.weight.data) 78 | constant_(self.value_proj.bias.data, 0.) 79 | xavier_uniform_(self.output_proj.weight.data) 80 | constant_(self.output_proj.bias.data, 0.) 
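    # A minimal, hedged shape sketch for this module (kept as comments; the level sizes and
    # query count below are illustrative assumptions, not a configuration taken from this repo):
    #
    #   attn = MSDeformAttn(d_model=256, n_levels=2, n_heads=8, n_points=4)
    #   spatial_shapes = torch.as_tensor([[32, 32], [16, 16]], dtype=torch.long)
    #   level_start_index = torch.as_tensor([0, 32 * 32], dtype=torch.long)
    #   src_flatten = torch.rand(2, 32 * 32 + 16 * 16, 256)   # (N, sum_l H_l*W_l, C)
    #   query = torch.rand(2, 100, 256)                       # (N, Len_q, C)
    #   reference_points = torch.rand(2, 100, 2, 2)           # (N, Len_q, n_levels, 2) in [0, 1]
    #   out = attn(query, reference_points, src_flatten, spatial_shapes, level_start_index)
    #   assert out.shape == (2, 100, 256)                     # (N, Len_q, C)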
81 | 82 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 83 | """ 84 | :param query (N, Length_{query}, C) 85 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 86 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 87 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 88 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 89 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 90 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 91 | 92 | :return output (N, Length_{query}, C) 93 | """ 94 | N, Len_q, _ = query.shape 95 | N, Len_in, _ = input_flatten.shape 96 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 97 | 98 | value = self.value_proj(input_flatten) 99 | if input_padding_mask is not None: 100 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 101 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 102 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 103 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 104 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 105 | # N, Len_q, n_heads, n_levels, n_points, 2 106 | if reference_points.shape[-1] == 2: 107 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 108 | sampling_locations = reference_points[:, :, None, :, None, :] \ 109 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] 110 | elif reference_points.shape[-1] == 4: 111 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 112 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 113 | else: 114 | raise ValueError( 115 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 116 | try: 117 | output = MSDeformAttnFunction.apply( 118 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 119 | except: 120 | # CPU 121 | output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 122 | # # For FLOPs calculation only 123 | # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 124 | output = self.output_proj(output) 125 | return output 126 | -------------------------------------------------------------------------------- /ovformer/data/dataset_mappers/mask_former_instance_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
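# Hedged usage sketch: a typical way to plug the mapper defined below into a detectron2
# training loader (the actual wiring used by this repo's training script may differ):
#
#   from detectron2.data import build_detection_train_loader
#   mapper = MaskFormerInstanceDatasetMapper(cfg, is_train=True)
#   train_loader = build_detection_train_loader(cfg, mapper=mapper)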
2 | import copy 3 | import logging 4 | 5 | import numpy as np 6 | import pycocotools.mask as mask_util 7 | import torch 8 | from torch.nn import functional as F 9 | 10 | from detectron2.config import configurable 11 | from detectron2.data import detection_utils as utils 12 | from detectron2.data import transforms as T 13 | from detectron2.projects.point_rend import ColorAugSSDTransform 14 | from detectron2.structures import BitMasks, Instances, polygons_to_bitmask 15 | 16 | __all__ = ["MaskFormerInstanceDatasetMapper"] 17 | 18 | 19 | class MaskFormerInstanceDatasetMapper: 20 | """ 21 | A callable which takes a dataset dict in Detectron2 Dataset format, 22 | and map it into a format used by MaskFormer for instance segmentation. 23 | 24 | The callable currently does the following: 25 | 26 | 1. Read the image from "file_name" 27 | 2. Applies geometric transforms to the image and annotation 28 | 3. Find and applies suitable cropping to the image and annotation 29 | 4. Prepare image and annotation to Tensors 30 | """ 31 | 32 | @configurable 33 | def __init__( 34 | self, 35 | is_train=True, 36 | *, 37 | augmentations, 38 | image_format, 39 | size_divisibility, 40 | ): 41 | """ 42 | NOTE: this interface is experimental. 43 | Args: 44 | is_train: for training or inference 45 | augmentations: a list of augmentations or deterministic transforms to apply 46 | image_format: an image format supported by :func:`detection_utils.read_image`. 47 | size_divisibility: pad image size to be divisible by this value 48 | """ 49 | self.is_train = is_train 50 | self.tfm_gens = augmentations 51 | self.img_format = image_format 52 | self.size_divisibility = size_divisibility 53 | 54 | logger = logging.getLogger(__name__) 55 | mode = "training" if is_train else "inference" 56 | logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}") 57 | 58 | @classmethod 59 | def from_config(cls, cfg, is_train=True): 60 | # Build augmentation 61 | augs = [ 62 | T.ResizeShortestEdge( 63 | cfg.INPUT.MIN_SIZE_TRAIN, 64 | cfg.INPUT.MAX_SIZE_TRAIN, 65 | cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, 66 | ) 67 | ] 68 | if cfg.INPUT.CROP.ENABLED: 69 | augs.append( 70 | T.RandomCrop( 71 | cfg.INPUT.CROP.TYPE, 72 | cfg.INPUT.CROP.SIZE, 73 | ) 74 | ) 75 | if cfg.INPUT.COLOR_AUG_SSD: 76 | augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) 77 | augs.append(T.RandomFlip()) 78 | 79 | ret = { 80 | "is_train": is_train, 81 | "augmentations": augs, 82 | "image_format": cfg.INPUT.FORMAT, 83 | "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY, 84 | } 85 | return ret 86 | 87 | def __call__(self, dataset_dict): 88 | """ 89 | Args: 90 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 91 | 92 | Returns: 93 | dict: a format that builtin models in detectron2 accept 94 | """ 95 | assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!" 
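        # The steps below: (1) apply the configured augmentations to the image,
        # (2) apply the same transforms to every non-crowd annotation, (3) decode polygons,
        # COCO RLEs or binary arrays into bitmasks, (4) pad image and masks to
        # size_divisibility, and (5) pack everything into an Instances object.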
96 | 97 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 98 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 99 | utils.check_image_size(dataset_dict, image) 100 | 101 | aug_input = T.AugInput(image) 102 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) 103 | image = aug_input.image 104 | 105 | # transform instnace masks 106 | assert "annotations" in dataset_dict 107 | for anno in dataset_dict["annotations"]: 108 | anno.pop("keypoints", None) 109 | 110 | annos = [ 111 | utils.transform_instance_annotations(obj, transforms, image.shape[:2]) 112 | for obj in dataset_dict.pop("annotations") 113 | if obj.get("iscrowd", 0) == 0 114 | ] 115 | 116 | if len(annos): 117 | assert "segmentation" in annos[0] 118 | segms = [obj["segmentation"] for obj in annos] 119 | masks = [] 120 | for segm in segms: 121 | if isinstance(segm, list): 122 | # polygon 123 | masks.append(polygons_to_bitmask(segm, *image.shape[:2])) 124 | elif isinstance(segm, dict): 125 | # COCO RLE 126 | masks.append(mask_util.decode(segm)) 127 | elif isinstance(segm, np.ndarray): 128 | assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format( 129 | segm.ndim 130 | ) 131 | # mask array 132 | masks.append(segm) 133 | else: 134 | raise ValueError( 135 | "Cannot convert segmentation of type '{}' to BitMasks!" 136 | "Supported types are: polygons as list[list[float] or ndarray]," 137 | " COCO-style RLE as a dict, or a binary segmentation mask " 138 | " in a 2D numpy array of shape HxW.".format(type(segm)) 139 | ) 140 | 141 | # Pad image and segmentation label here! 142 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 143 | masks = [torch.from_numpy(np.ascontiguousarray(x)) for x in masks] 144 | 145 | classes = [int(obj["category_id"]) for obj in annos] 146 | classes = torch.tensor(classes, dtype=torch.int64) 147 | 148 | if self.size_divisibility > 0: 149 | image_size = (image.shape[-2], image.shape[-1]) 150 | padding_size = [ 151 | 0, 152 | self.size_divisibility - image_size[1], 153 | 0, 154 | self.size_divisibility - image_size[0], 155 | ] 156 | # pad image 157 | image = F.pad(image, padding_size, value=128).contiguous() 158 | # pad mask 159 | masks = [F.pad(x, padding_size, value=0).contiguous() for x in masks] 160 | 161 | image_shape = (image.shape[-2], image.shape[-1]) # h, w 162 | 163 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 164 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 165 | # Therefore it's important to use torch.Tensor. 166 | dataset_dict["image"] = image 167 | 168 | # Prepare per-category binary masks 169 | instances = Instances(image_shape) 170 | instances.gt_classes = classes 171 | if len(masks) == 0: 172 | # Some image does not have annotation (all ignored) 173 | instances.gt_masks = torch.zeros((0, image.shape[-2], image.shape[-1])) 174 | else: 175 | masks = BitMasks(torch.stack(masks)) 176 | instances.gt_masks = masks.tensor 177 | 178 | dataset_dict["instances"] = instances 179 | 180 | return dataset_dict 181 | -------------------------------------------------------------------------------- /ovformer/data/dataset_mappers/mask_former_semantic_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
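# This mapper mirrors MaskFormerInstanceDatasetMapper above, but it consumes a per-pixel
# label map ("sem_seg_file_name") instead of per-instance "annotations": every class id
# present in the label map (except ignore_label) is turned into one binary ground-truth mask.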
2 | import copy 3 | import logging 4 | 5 | import numpy as np 6 | import torch 7 | from torch.nn import functional as F 8 | 9 | from detectron2.config import configurable 10 | from detectron2.data import MetadataCatalog 11 | from detectron2.data import detection_utils as utils 12 | from detectron2.data import transforms as T 13 | from detectron2.projects.point_rend import ColorAugSSDTransform 14 | from detectron2.structures import BitMasks, Instances 15 | 16 | __all__ = ["MaskFormerSemanticDatasetMapper"] 17 | 18 | 19 | class MaskFormerSemanticDatasetMapper: 20 | """ 21 | A callable which takes a dataset dict in Detectron2 Dataset format, 22 | and map it into a format used by MaskFormer for semantic segmentation. 23 | 24 | The callable currently does the following: 25 | 26 | 1. Read the image from "file_name" 27 | 2. Applies geometric transforms to the image and annotation 28 | 3. Find and applies suitable cropping to the image and annotation 29 | 4. Prepare image and annotation to Tensors 30 | """ 31 | 32 | @configurable 33 | def __init__( 34 | self, 35 | is_train=True, 36 | *, 37 | augmentations, 38 | image_format, 39 | ignore_label, 40 | size_divisibility, 41 | ): 42 | """ 43 | NOTE: this interface is experimental. 44 | Args: 45 | is_train: for training or inference 46 | augmentations: a list of augmentations or deterministic transforms to apply 47 | image_format: an image format supported by :func:`detection_utils.read_image`. 48 | ignore_label: the label that is ignored to evaluation 49 | size_divisibility: pad image size to be divisible by this value 50 | """ 51 | self.is_train = is_train 52 | self.tfm_gens = augmentations 53 | self.img_format = image_format 54 | self.ignore_label = ignore_label 55 | self.size_divisibility = size_divisibility 56 | 57 | logger = logging.getLogger(__name__) 58 | mode = "training" if is_train else "inference" 59 | logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}") 60 | 61 | @classmethod 62 | def from_config(cls, cfg, is_train=True): 63 | # Build augmentation 64 | augs = [ 65 | T.ResizeShortestEdge( 66 | cfg.INPUT.MIN_SIZE_TRAIN, 67 | cfg.INPUT.MAX_SIZE_TRAIN, 68 | cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, 69 | ) 70 | ] 71 | if cfg.INPUT.CROP.ENABLED: 72 | augs.append( 73 | T.RandomCrop_CategoryAreaConstraint( 74 | cfg.INPUT.CROP.TYPE, 75 | cfg.INPUT.CROP.SIZE, 76 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA, 77 | cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 78 | ) 79 | ) 80 | if cfg.INPUT.COLOR_AUG_SSD: 81 | augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) 82 | augs.append(T.RandomFlip()) 83 | 84 | # Assume always applies to the training set. 85 | dataset_names = cfg.DATASETS.TRAIN 86 | meta = MetadataCatalog.get(dataset_names[0]) 87 | ignore_label = meta.ignore_label 88 | 89 | ret = { 90 | "is_train": is_train, 91 | "augmentations": augs, 92 | "image_format": cfg.INPUT.FORMAT, 93 | "ignore_label": ignore_label, 94 | "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY, 95 | } 96 | return ret 97 | 98 | def __call__(self, dataset_dict): 99 | """ 100 | Args: 101 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 102 | 103 | Returns: 104 | dict: a format that builtin models in detectron2 accept 105 | """ 106 | assert self.is_train, "MaskFormerSemanticDatasetMapper should only be used for training!" 
107 | 108 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 109 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 110 | utils.check_image_size(dataset_dict, image) 111 | 112 | if "sem_seg_file_name" in dataset_dict: 113 | # PyTorch transformation not implemented for uint16, so converting it to double first 114 | sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") 115 | else: 116 | sem_seg_gt = None 117 | 118 | if sem_seg_gt is None: 119 | raise ValueError( 120 | "Cannot find 'sem_seg_file_name' for semantic segmentation dataset {}.".format( 121 | dataset_dict["file_name"] 122 | ) 123 | ) 124 | 125 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt) 126 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) 127 | image = aug_input.image 128 | sem_seg_gt = aug_input.sem_seg 129 | 130 | # Pad image and segmentation label here! 131 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 132 | if sem_seg_gt is not None: 133 | sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) 134 | 135 | if self.size_divisibility > 0: 136 | image_size = (image.shape[-2], image.shape[-1]) 137 | padding_size = [ 138 | 0, 139 | self.size_divisibility - image_size[1], 140 | 0, 141 | self.size_divisibility - image_size[0], 142 | ] 143 | image = F.pad(image, padding_size, value=128).contiguous() 144 | if sem_seg_gt is not None: 145 | sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous() 146 | 147 | image_shape = (image.shape[-2], image.shape[-1]) # h, w 148 | 149 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 150 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 151 | # Therefore it's important to use torch.Tensor. 152 | dataset_dict["image"] = image 153 | 154 | if sem_seg_gt is not None: 155 | dataset_dict["sem_seg"] = sem_seg_gt.long() 156 | 157 | if "annotations" in dataset_dict: 158 | raise ValueError("Semantic segmentation dataset should not have 'annotations'.") 159 | 160 | # Prepare per-category binary masks 161 | if sem_seg_gt is not None: 162 | sem_seg_gt = sem_seg_gt.numpy() 163 | instances = Instances(image_shape) 164 | classes = np.unique(sem_seg_gt) 165 | # remove ignored region 166 | classes = classes[classes != self.ignore_label] 167 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64) 168 | 169 | masks = [] 170 | for class_id in classes: 171 | masks.append(sem_seg_gt == class_id) 172 | 173 | if len(masks) == 0: 174 | # Some image does not have annotation (all ignored) 175 | instances.gt_masks = torch.zeros((0, sem_seg_gt.shape[-2], sem_seg_gt.shape[-1])) 176 | else: 177 | masks = BitMasks( 178 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) 179 | ) 180 | instances.gt_masks = masks.tensor 181 | 182 | dataset_dict["instances"] = instances 183 | 184 | return dataset_dict 185 | -------------------------------------------------------------------------------- /ovformer/modeling/transformer_decoder/maskformer_transformer_decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py 3 | import fvcore.nn.weight_init as weight_init 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | from detectron2.config import configurable 9 | from detectron2.layers import Conv2d 10 | from detectron2.utils.registry import Registry 11 | 12 | from .position_encoding import PositionEmbeddingSine 13 | from .transformer import Transformer 14 | 15 | 16 | TRANSFORMER_DECODER_REGISTRY = Registry("TRANSFORMER_MODULE") 17 | TRANSFORMER_DECODER_REGISTRY.__doc__ = """ 18 | Registry for transformer module in MaskFormer. 19 | """ 20 | 21 | 22 | def build_transformer_decoder(cfg, in_channels, mask_classification=True): 23 | """ 24 | Build a instance embedding branch from `cfg.MODEL.INS_EMBED_HEAD.NAME`. 25 | """ 26 | name = cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME 27 | return TRANSFORMER_DECODER_REGISTRY.get(name)(cfg, in_channels, mask_classification) 28 | 29 | 30 | @TRANSFORMER_DECODER_REGISTRY.register() 31 | class StandardTransformerDecoder(nn.Module): 32 | @configurable 33 | def __init__( 34 | self, 35 | in_channels, 36 | mask_classification=True, 37 | *, 38 | num_classes: int, 39 | hidden_dim: int, 40 | num_queries: int, 41 | nheads: int, 42 | dropout: float, 43 | dim_feedforward: int, 44 | enc_layers: int, 45 | dec_layers: int, 46 | pre_norm: bool, 47 | deep_supervision: bool, 48 | mask_dim: int, 49 | enforce_input_project: bool, 50 | ): 51 | """ 52 | NOTE: this interface is experimental. 53 | Args: 54 | in_channels: channels of the input features 55 | mask_classification: whether to add mask classifier or not 56 | num_classes: number of classes 57 | hidden_dim: Transformer feature dimension 58 | num_queries: number of queries 59 | nheads: number of heads 60 | dropout: dropout in Transformer 61 | dim_feedforward: feature dimension in feedforward network 62 | enc_layers: number of Transformer encoder layers 63 | dec_layers: number of Transformer decoder layers 64 | pre_norm: whether to use pre-LayerNorm or not 65 | deep_supervision: whether to add supervision to every decoder layers 66 | mask_dim: mask feature dimension 67 | enforce_input_project: add input project 1x1 conv even if input 68 | channels and hidden dim is identical 69 | """ 70 | super().__init__() 71 | 72 | self.mask_classification = mask_classification 73 | 74 | # positional encoding 75 | N_steps = hidden_dim // 2 76 | self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) 77 | 78 | transformer = Transformer( 79 | d_model=hidden_dim, 80 | dropout=dropout, 81 | nhead=nheads, 82 | dim_feedforward=dim_feedforward, 83 | num_encoder_layers=enc_layers, 84 | num_decoder_layers=dec_layers, 85 | normalize_before=pre_norm, 86 | return_intermediate_dec=deep_supervision, 87 | ) 88 | 89 | self.num_queries = num_queries 90 | self.transformer = transformer 91 | hidden_dim = transformer.d_model 92 | 93 | self.query_embed = nn.Embedding(num_queries, hidden_dim) 94 | 95 | if in_channels != hidden_dim or enforce_input_project: 96 | self.input_proj = Conv2d(in_channels, hidden_dim, kernel_size=1) 97 | weight_init.c2_xavier_fill(self.input_proj) 98 | else: 99 | self.input_proj = nn.Sequential() 100 | self.aux_loss = deep_supervision 101 | 102 | # output FFNs 103 | if self.mask_classification: 104 | self.class_embed = nn.Linear(hidden_dim, num_classes + 1) 105 | self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3) 106 | 107 | @classmethod 108 | def from_config(cls, cfg, in_channels, 
mask_classification): 109 | ret = {} 110 | ret["in_channels"] = in_channels 111 | ret["mask_classification"] = mask_classification 112 | 113 | ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES 114 | ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM 115 | ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES 116 | # Transformer parameters: 117 | ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS 118 | ret["dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT 119 | ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD 120 | ret["enc_layers"] = cfg.MODEL.MASK_FORMER.ENC_LAYERS 121 | ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS 122 | ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM 123 | ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION 124 | ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ 125 | 126 | ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 127 | 128 | return ret 129 | 130 | def forward(self, x, mask_features, mask=None): 131 | if mask is not None: 132 | mask = F.interpolate(mask[None].float(), size=x.shape[-2:]).to(torch.bool)[0] 133 | pos = self.pe_layer(x, mask) 134 | 135 | src = x 136 | hs, memory = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos) 137 | 138 | if self.mask_classification: 139 | outputs_class = self.class_embed(hs) 140 | out = {"pred_logits": outputs_class[-1]} 141 | else: 142 | out = {} 143 | 144 | if self.aux_loss: 145 | # [l, bs, queries, embed] 146 | mask_embed = self.mask_embed(hs) 147 | outputs_seg_masks = torch.einsum("lbqc,bchw->lbqhw", mask_embed, mask_features) 148 | out["pred_masks"] = outputs_seg_masks[-1] 149 | out["aux_outputs"] = self._set_aux_loss( 150 | outputs_class if self.mask_classification else None, outputs_seg_masks 151 | ) 152 | else: 153 | # FIXME h_boxes takes the last one computed, keep this in mind 154 | # [bs, queries, embed] 155 | mask_embed = self.mask_embed(hs[-1]) 156 | outputs_seg_masks = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features) 157 | out["pred_masks"] = outputs_seg_masks 158 | return out 159 | 160 | @torch.jit.unused 161 | def _set_aux_loss(self, outputs_class, outputs_seg_masks): 162 | # this is a workaround to make torchscript happy, as torchscript 163 | # doesn't support dictionary with non-homogeneous values, such 164 | # as a dict having both a Tensor and a list. 
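        # Each entry returned below corresponds to one intermediate decoder layer; the final
        # layer is dropped (the [:-1]) because its predictions are already exposed as the
        # top-level "pred_logits"/"pred_masks" in forward().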
165 | if self.mask_classification: 166 | return [ 167 | {"pred_logits": a, "pred_masks": b} 168 | for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1]) 169 | ] 170 | else: 171 | return [{"pred_masks": b} for b in outputs_seg_masks[:-1]] 172 | 173 | 174 | class MLP(nn.Module): 175 | """Very simple multi-layer perceptron (also called FFN)""" 176 | 177 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 178 | super().__init__() 179 | self.num_layers = num_layers 180 | h = [hidden_dim] * (num_layers - 1) 181 | self.layers = nn.ModuleList( 182 | nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) 183 | ) 184 | 185 | def forward(self, x): 186 | for i, layer in enumerate(self.layers): 187 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 188 | return x 189 | -------------------------------------------------------------------------------- /ovformer/data_video/datasets/burst.py: -------------------------------------------------------------------------------- 1 | 2 | import contextlib 3 | import io 4 | import json 5 | import logging 6 | import numpy as np 7 | import os 8 | import pycocotools.mask as mask_util 9 | from fvcore.common.file_io import PathManager 10 | from fvcore.common.timer import Timer 11 | 12 | from detectron2.structures import Boxes, BoxMode, PolygonMasks 13 | from detectron2.data import DatasetCatalog, MetadataCatalog 14 | from .burst_categories import BURST_CATEGORIES 15 | """ 16 | This file contains functions to parse YTVIS dataset of 17 | COCO-format annotations into dicts in "Detectron2 format". 18 | """ 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | __all__ = ["load_burst_json", "register_burst_instances"] 23 | 24 | 25 | def _get_burst_instances_meta(): 26 | thing_ids = [k["id"] for k in BURST_CATEGORIES] 27 | assert len(thing_ids) == 482, len(thing_ids) 28 | # Mapping from the incontiguous YTVIS category id to an id in [0, 39] 29 | thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} 30 | thing_classes = [k["name"] for k in BURST_CATEGORIES] 31 | ret = { 32 | "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, 33 | "thing_classes": thing_classes, 34 | } 35 | return ret 36 | 37 | 38 | def load_burst_json(json_file, image_root, dataset_name=None, extra_annotation_keys=None): 39 | from .ytvis_api.burst import BURST 40 | 41 | timer = Timer() 42 | json_file = PathManager.get_local_path(json_file) 43 | with contextlib.redirect_stdout(io.StringIO()): 44 | ytvis_api = BURST(json_file) 45 | if timer.seconds() > 1: 46 | logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds())) 47 | 48 | id_map = None 49 | if dataset_name is not None: 50 | meta = MetadataCatalog.get(dataset_name) 51 | cat_ids = sorted(ytvis_api.getCatIds()) 52 | cats = ytvis_api.loadCats(cat_ids) 53 | # The categories in a custom json file may not be sorted. 54 | thing_classes = [c["name"] for c in sorted(cats, key=lambda x: x["id"])] 55 | meta.thing_classes = thing_classes 56 | 57 | # In COCO, certain category ids are artificially removed, 58 | # and by convention they are always ignored. 59 | # We deal with COCO's id issue and translate 60 | # the category ids to contiguous ids in [0, 80). 61 | 62 | # It works by looking at the "categories" field in the json, therefore 63 | # if users' own json also have incontiguous ids, we'll 64 | # apply this mapping as well but print a warning. 
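    # e.g. cat_ids = [1, 3, 7]  ->  id_map = {1: 0, 3: 1, 7: 2}: dataset category ids are
    # remapped to contiguous, 0-based ids for training.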
65 | if not (min(cat_ids) == 1 and max(cat_ids) == len(cat_ids)): 66 | if "coco" not in dataset_name: 67 | logger.warning( 68 | """ 69 | Category ids in annotations are not in [1, #categories]! We'll apply a mapping for you. 70 | """ 71 | ) 72 | id_map = {v: i for i, v in enumerate(cat_ids)} 73 | meta.thing_dataset_id_to_contiguous_id = id_map 74 | # sort indices for reproducible results 75 | vid_ids = sorted(ytvis_api.vids.keys()) 76 | # vids is a list of dicts, each looks something like: 77 | # {'license': 1, 78 | # 'flickr_url': ' ', 79 | # 'file_names': ['ff25f55852/00000.jpg', 'ff25f55852/00005.jpg', ..., 'ff25f55852/00175.jpg'], 80 | # 'height': 720, 81 | # 'width': 1280, 82 | # 'length': 36, 83 | # 'date_captured': '2019-04-11 00:55:41.903902', 84 | # 'id': 2232} 85 | vids = ytvis_api.loadVids(vid_ids) 86 | 87 | anns = [ytvis_api.vidToAnns[vid_id] for vid_id in vid_ids] 88 | total_num_valid_anns = sum([len(x) for x in anns]) 89 | total_num_anns = len(ytvis_api.anns) 90 | if total_num_valid_anns < total_num_anns: 91 | logger.warning( 92 | f"{json_file} contains {total_num_anns} annotations, but only " 93 | f"{total_num_valid_anns} of them match to images in the file." 94 | ) 95 | 96 | vids_anns = list(zip(vids, anns)) 97 | logger.info("Loaded {} videos in YTVIS format from {}".format(len(vids_anns), json_file)) 98 | 99 | dataset_dicts = [] 100 | 101 | ann_keys = ["iscrowd", "category_id", "id"] + (extra_annotation_keys or []) 102 | 103 | num_instances_without_valid_segmentation = 0 104 | 105 | for (vid_dict, anno_dict_list) in vids_anns: 106 | record = {} 107 | #record["file_names"] = [os.path.join(image_root, vid_dict["file_names"][i]) for i in range(vid_dict["length"])] 108 | record["file_names"] = [os.path.join(image_root, '/'.join(vid_dict["file_names"][i].split('\\')[-2:])) for i in range(vid_dict["length"])] 109 | record["height"] = vid_dict["height"] 110 | record["width"] = vid_dict["width"] 111 | record["length"] = vid_dict["length"] 112 | video_id = record["video_id"] = vid_dict["id"] 113 | 114 | video_objs = [] 115 | for frame_idx in range(record["length"]): 116 | frame_objs = [] 117 | for anno in anno_dict_list: 118 | assert anno["video_id"] == video_id 119 | 120 | obj = {key: anno[key] for key in ann_keys if key in anno} 121 | 122 | _segm = anno.get("segmentations", None) 123 | 124 | if not ( _segm and _segm[frame_idx]): 125 | continue 126 | 127 | segm = _segm[frame_idx] 128 | 129 | 130 | if isinstance(segm, dict): 131 | if isinstance(segm["counts"], list): 132 | # convert to compressed RLE 133 | segm = mask_util.frPyObjects(segm, *segm["size"]) 134 | elif segm: 135 | # filter out invalid polygons (< 3 points) 136 | segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6] 137 | if len(segm) == 0: 138 | num_instances_without_valid_segmentation += 1 139 | continue # ignore this instance 140 | obj["segmentation"] = segm 141 | 142 | if id_map: 143 | obj["category_id"] = id_map[obj["category_id"]] 144 | frame_objs.append(obj) 145 | video_objs.append(frame_objs) 146 | record["annotations"] = video_objs 147 | dataset_dicts.append(record) 148 | 149 | if num_instances_without_valid_segmentation > 0: 150 | logger.warning( 151 | "Filtered out {} instances without valid segmentation. ".format( 152 | num_instances_without_valid_segmentation 153 | ) 154 | + "There might be issues in your dataset generation process. " 155 | "A valid polygon should be a list[float] with even length >= 6." 
156 | ) 157 | return dataset_dicts 158 | 159 | 160 | def register_burst_instances(name, metadata, json_file, image_root): 161 | """ 162 | Register a dataset in YTVIS's json annotation format for 163 | instance tracking. 164 | 165 | Args: 166 | name (str): the name that identifies a dataset, e.g. "ytvis_train". 167 | metadata (dict): extra metadata associated with this dataset. You can 168 | leave it as an empty dict. 169 | json_file (str): path to the json instance annotation file. 170 | image_root (str or path-like): directory which contains all the images. 171 | """ 172 | assert isinstance(name, str), name 173 | assert isinstance(json_file, (str, os.PathLike)), json_file 174 | assert isinstance(image_root, (str, os.PathLike)), image_root 175 | # 1. register a function which returns dicts 176 | DatasetCatalog.register(name, lambda: load_burst_json(json_file, image_root, name)) 177 | 178 | # 2. Optionally, add metadata about this dataset, 179 | # since they might be useful in evaluation, visualization or logging 180 | MetadataCatalog.get(name).set( 181 | json_file=json_file, image_root=image_root, evaluator_type="ytvis", **metadata 182 | ) 183 | 184 | --------------------------------------------------------------------------------
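# Hedged usage sketch (not part of the repository files above): registering a BURST split
# converted by tools/burst2ytvis.py with register_burst_instances, mirroring the LVIS
# registration pattern in ovformer/data/datasets/lvis_v1.py. The image root below is an
# illustrative placeholder, not a path guaranteed by this repo.
#
#   from ovformer.data_video.datasets.burst import register_burst_instances, _get_burst_instances_meta
#   register_burst_instances(
#       "burst_val_b2y",                         # hypothetical dataset name
#       _get_burst_instances_meta(),
#       "datasets/burst/val/b2y_val.json",       # default output of tools/burst2ytvis.py
#       "datasets/burst/frames",                 # hypothetical image root
#   )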