├── .gitattributes ├── .gitignore ├── DATASET.md ├── README.md ├── __init__.py ├── configs ├── dinov_sam_ade_eval.yaml ├── dinov_sam_coco_swinl_train.yaml └── dinov_sam_coco_train.yaml ├── datasets ├── __init__.py ├── build.py ├── custom_dataset_dataloader.py ├── dataset_mappers │ ├── __init__.py │ ├── bdd_semseg_dataset_mapper.py │ ├── coco_instance_new_baseline_dataset_mapper.py │ ├── coco_interactive_panoptic_new_baseline_dataset_mapper.py │ ├── coco_panoptic_new_baseline_dataset_mapper.py │ ├── davis_dataset_mapper.py │ ├── inference_mapper_with_gt.py │ ├── instance_inference_mapper_with_gt.py │ ├── lvis_dataset_mapper.py │ ├── lvis_dataset_mapper_with_gt.py │ ├── mask_former_instance_dataset_mapper.py │ ├── mask_former_interactive_panoptic_dataset_mapper.py │ ├── mask_former_panoptic_dataset_mapper.py │ ├── mask_former_semantic_dataset_mapper.py │ ├── o365_instance_new_baseline_dataset_mapper.py │ ├── object365_dataset_mapper.py │ ├── part_data_filter_whole_new_instance_dataset_mapper.py │ ├── pascal_instance_new_baseline_dataset_mapper.py │ ├── pascalcontext_dataset_mapper.py │ ├── sam_baseline_dataset_mapper.py │ ├── sam_baseline_dataset_mapper_content.py │ ├── sam_baseline_dataset_mapper_json.py │ ├── scannet_dataset_mapper.py │ ├── scannet_pano_dataset_mapper.py │ ├── seginw_dataset_mapper.py │ ├── sunrgbd_dataset_mapper.py │ └── ytvos_dataset_mapper.py ├── evaluation │ ├── __init__.py │ ├── instance_evaluation.py │ ├── interactive_evaluation.py │ ├── panoptic_evaluation.py │ ├── pascal_part_evaluation.py │ └── segmentation_evaluation.py ├── registration │ ├── __init__.py │ ├── register_ade20k_full.py │ ├── register_ade20k_instance.py │ ├── register_ade20k_panoptic.py │ ├── register_bdd100k_panoseg.py │ ├── register_bdd100k_semseg.py │ ├── register_coco_panoptic_annos_semseg.py │ ├── register_coco_panoptic_annos_semseg_interactive.py │ ├── register_coco_panoptic_annos_semseg_interactive_jointboxpoint.py │ ├── register_coco_stuff_10k.py │ ├── register_context_semseg.py │ ├── register_davis_dataset.py │ ├── register_lvis_eval.py │ ├── register_object365_od.py │ ├── register_odinw_od.py │ ├── register_paco_part_all.py │ ├── register_partimagenet_part_all.py │ ├── register_pascal_part_all.py │ ├── register_pascal_part_all_interactive.py │ ├── register_sam.py │ ├── register_sam_json.py │ ├── register_scannet_panoptic.py │ ├── register_scannet_semseg.py │ ├── register_seginw_instance.py │ ├── register_sunrgbd_semseg.py │ └── register_ytvos_dataset.py ├── semseg_loader.py ├── shapes │ ├── __init__.py │ ├── mask_generators.py │ ├── sampler.py │ ├── scribble.py │ └── simpleclick_sampler.py └── utils │ └── tsv │ ├── __init__.py │ ├── io_common.py │ └── tsv_io.py ├── demo ├── __init__.py ├── examples │ ├── bags.jpg │ ├── corgi2.jpg │ ├── img.png │ └── ref_cat.jpeg └── openset_task.py ├── demo_openset.py ├── dinov ├── BaseModel.py ├── __init__.py ├── architectures │ ├── __init__.py │ ├── build.py │ ├── dinov.py │ └── registry.py ├── backbone │ ├── __init__.py │ ├── backbone.py │ ├── build.py │ ├── focal.py │ ├── focal_dw.py │ ├── registry.py │ └── swin.py ├── body │ ├── __init__.py │ ├── build.py │ ├── decoder │ │ ├── __init__.py │ │ ├── build.py │ │ ├── dinov_openset_decoder.py │ │ ├── dinov_refer_decoder.py │ │ ├── registry.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── dino_decoder.py │ │ │ └── utils.py │ ├── encoder │ │ ├── __init__.py │ │ ├── build.py │ │ ├── encoder_deform.py │ │ ├── ops │ │ │ ├── functions │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn_func.py │ │ │ ├── modules │ 
│ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn.py │ │ │ ├── setup.py │ │ │ ├── src │ │ │ │ ├── cpu │ │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ │ └── ms_deform_attn_cpu.h │ │ │ │ ├── cuda │ │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ │ ├── ms_deform_attn.h │ │ │ │ └── vision.cpp │ │ │ └── test.py │ │ ├── registry.py │ │ └── transformer_encoder_fpn.py │ ├── general_head.py │ ├── registry.py │ └── transformer_blocks.py ├── language │ ├── __init__.py │ └── build.py ├── modules │ ├── __init__.py │ ├── criterion_visual_openset.py │ ├── criterion_visual_refer_many2many.py │ ├── criterion_visual_refer_one2one.py │ ├── matcher.py │ ├── matcher_many2many.py │ ├── position_encoding.py │ └── postprocessing.py └── utils │ ├── __init__.py │ ├── box_ops.py │ ├── config.py │ └── misc.py ├── repo.diff ├── requirements.txt ├── train_net.py └── utils ├── Config.py ├── __init__.py ├── arguments.py ├── constants.py ├── dist.py ├── lvis_cat.py ├── misc.py ├── model.py ├── sam_utils ├── __init__.py ├── amg.py ├── onnx.py └── transforms.py └── visualizer.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # IntelliJ project files 2 | #repo.diff 3 | .idea 4 | .vscode 5 | .amltignore 6 | *.iml 7 | out 8 | gen 9 | visinf 10 | coco_caption 11 | ### Vim template 12 | [._]*.s[a-w][a-z] 13 | [._]s[a-w][a-z] 14 | *.un~ 15 | Session.vim 16 | .netrwhist 17 | *~ 18 | *.sh 19 | vis_scribble 20 | 21 | ### IPythonNotebook template 22 | # Temporary data 23 | .ipynb_checkpoints/ 24 | 25 | ### Python template 26 | # Byte-compiled / optimized / DLL files 27 | __pycache__/ 28 | *.py[cod] 29 | *$py.class 30 | 31 | # C extensions 32 | *.so 33 | 34 | # Distribution / packaging 35 | .Python 36 | env/ 37 | build/ 38 | develop-eggs/ 39 | dist/ 40 | downloads/ 41 | eggs/ 42 | .eggs/ 43 | #lib/ 44 | #lib64/ 45 | parts/ 46 | sdist/ 47 | var/ 48 | *.egg-info/ 49 | .installed.cfg 50 | *.egg 51 | 52 | # PyInstaller 53 | # Usually these files are written by a python script from a template 54 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
55 | *.manifest 56 | *.spec 57 | 58 | # Installer logs 59 | pip-log.txt 60 | pip-delete-this-directory.txt 61 | 62 | # Unit test / coverage reports 63 | htmlcov/ 64 | .tox/ 65 | .coverage 66 | .coverage.* 67 | .cache 68 | nosetests.xml 69 | coverage.xml 70 | *,cover 71 | 72 | # Translations 73 | *.mo 74 | *.pot 75 | 76 | # Django stuff: 77 | *.log 78 | 79 | # Sphinx documentation 80 | docs/_build/ 81 | 82 | # PyBuilder 83 | target/ 84 | 85 | *.ipynb 86 | *.params 87 | # *.json 88 | #.vscode/ 89 | *.code-workspace/ 90 | 91 | lib/pycocotools/_mask.c 92 | lib/nms/cpu_nms.c 93 | 94 | OUTPUT 95 | OUTPUT/* 96 | models/* 97 | DATASET 98 | DATASET/* 99 | external/ 100 | MODELS 101 | MODELS/* 102 | 103 | kill.sh 104 | 105 | draws/ 106 | plot/ 107 | 108 | 109 | 110 | 111 | *venv/* 112 | *.pt 113 | *.pth 114 | -------------------------------------------------------------------------------- /DATASET.md: -------------------------------------------------------------------------------- 1 | # Preparing Dataset 2 | Our dataloader follows [Detectron2](https://github.com/facebookresearch/detectron2) and contains (1) a dataset registrator, (2) a dataset mapper, and (3) a dataset loader. We modify the dataset registrator and the mapper for different datasets (a minimal sketch of this flow is included further below). 3 | 4 | ## Training Dataset Note 5 | 6 | There is overlap between the COCO2017, COCO-Karpathy, and RefCOCO datasets, and RefCOCO overlaps entirely with the COCO2017 training data; we therefore exclude the refcocog-umd validation and coco-karpathy test splits during training. 7 | 8 | ## ADE20K, Cityscapes, COCO 9 | Please refer to [Mask2Former](https://github.com/facebookresearch/Mask2Former/tree/main/datasets). 10 | 11 | ## BDD100K 12 | Please download the 10k split of BDD100k at https://doc.bdd100k.com/download.html#id1 13 | 14 | ### Expected dataset structure for bdd100k: 15 | ``` 16 | . 17 | └── bdd100k/ 18 | ├── images/ 19 | │ └── 10k/ 20 | │ ├── test 21 | │ ├── train 22 | │ └── val 23 | └── labels/ 24 | ├── ins_seg 25 | ├── pan_seg 26 | └── sem_seg 27 | ``` 28 | 29 | ## RefCOCO 30 | Please download the original refcoco datasets at https://github.com/lichengunc/refer. 31 | 32 | ### Expected dataset structure for refcoco: 33 | ``` 34 | . 35 | └── refcocoseg/ 36 | └── refcocog/ 37 | ├── instances.json 38 | ├── refs(google).p 39 | └── refs(umd).p 40 | ``` 41 | 42 | Also download the coco dataset at https://cocodataset.org/#home: 43 | ### Expected dataset structure for coco: 44 | ``` 45 | . 46 | └── coco/ 47 | ├── annotations 48 | ├── train2017 49 | └── val2017 50 | ``` 51 | 52 | After preparing the dataset, run the following command: 53 | 54 | ```sh 55 | # NOTE: Please modify coco_root and ref_root 56 | python3 refcoco2json.py 57 | ``` 58 | 59 | ## SUN-RGBD 60 | 61 | 62 | ## SCAN-Net 63 | 64 | 65 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UX-Decoder/DINOv/53bf20d5cfdbb86fa35141a1cff432d4923599f2/__init__.py -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from . import registration 2 | from .build import * -------------------------------------------------------------------------------- /datasets/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates.
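The `datasets` package whose mapper imports follow implements the registrator / mapper / loader split described in DATASET.md above. The block below is a minimal, hypothetical sketch of that flow using the standard Detectron2 catalog and dataloader APIs; it is not code from this repository, and every name and path in it (`my_custom_train`, `load_my_dicts`, the example image path, the batch size) is a placeholder.

```python
# Hypothetical sketch of the registrator -> mapper -> loader flow (not repository code).
import copy

import numpy as np
import torch
from detectron2.data import DatasetCatalog, MetadataCatalog, build_detection_train_loader
from detectron2.data import detection_utils as utils


def load_my_dicts():
    # Registrator: return a list of dicts in the Detectron2 Dataset format.
    return [{
        "file_name": "datasets/coco/train2017/000000000009.jpg",  # placeholder path
        "image_id": 9, "height": 480, "width": 640, "annotations": [],
    }]


def my_mapper(dataset_dict):
    # Mapper: turn one dataset dict into the tensors a model consumes.
    dataset_dict = copy.deepcopy(dataset_dict)
    image = utils.read_image(dataset_dict["file_name"], format="RGB")
    dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
    return dataset_dict


DatasetCatalog.register("my_custom_train", load_my_dicts)
MetadataCatalog.get("my_custom_train").set(thing_classes=["example"])

# Loader: batch the mapped dicts for training.
loader = build_detection_train_loader(
    dataset=DatasetCatalog.get("my_custom_train"),
    mapper=my_mapper,
    total_batch_size=2,
)
```

In this repository the same roles are filled by the registration modules under `datasets/registration`, the mapper classes imported below, and the loader logic in `datasets/build.py` and `datasets/custom_dataset_dataloader.py`.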
2 | from .coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper 3 | from .coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper 4 | from .coco_interactive_panoptic_new_baseline_dataset_mapper import COCOInteractivePanopticNewBaselineDatasetMapper 5 | from .mask_former_instance_dataset_mapper import MaskFormerInstanceDatasetMapper 6 | from .mask_former_panoptic_dataset_mapper import MaskFormerPanopticDatasetMapper 7 | from .mask_former_interactive_panoptic_dataset_mapper import MaskFormerPanopticDatasetMapperInteractive 8 | from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper 9 | from .sunrgbd_dataset_mapper import SunRGBDSegDatasetMapper 10 | from .scannet_dataset_mapper import ScanNetSegDatasetMapper 11 | from .bdd_semseg_dataset_mapper import BDDSemDatasetMapper 12 | from .scannet_pano_dataset_mapper import ScanNetPanoDatasetMapper 13 | from .o365_instance_new_baseline_dataset_mapper import O365InstanceNewBaselineDatasetMapper 14 | from .sam_baseline_dataset_mapper import build_transform_gen as sam_transform_gen 15 | from .sam_baseline_dataset_mapper import SamBaselineDatasetMapper 16 | from .sam_baseline_dataset_mapper_json import SamBaselineDatasetMapperJSON 17 | from .sam_baseline_dataset_mapper_content import SamBaselineDatasetMapperContent 18 | from .pascal_instance_new_baseline_dataset_mapper import PascalInstanceNewBaselineDatasetMapper 19 | from .part_data_filter_whole_new_instance_dataset_mapper import PartFilterWholeInstanceNewBaselineDatasetMapper 20 | from .inference_mapper_with_gt import CoCoInferenceDatasetMapper 21 | from .instance_inference_mapper_with_gt import InstanceInferenceDatasetMapperGT 22 | 23 | from .davis_dataset_mapper import DAVISDatasetMapper 24 | from .ytvos_dataset_mapper import YTVOSDatasetMapper 25 | from .seginw_dataset_mapper import SeginWDatasetMapper 26 | from .lvis_dataset_mapper_with_gt import LVISInferenceMapperWithGT 27 | from .pascalcontext_dataset_mapper import PascalContextSegDatasetMapper -------------------------------------------------------------------------------- /datasets/dataset_mappers/bdd_semseg_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # X-Decoder -- Generalized Decoding for Pixel, Image, and Language 3 | # Copyright (c) 2022 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified by Xueyan Zou (xueyan@cs.wisc.edu) 6 | # -------------------------------------------------------- 7 | # Copyright (c) Facebook, Inc. and its affiliates. 8 | import copy 9 | 10 | import scipy.io 11 | import numpy as np 12 | import torch 13 | from PIL import Image 14 | 15 | from torchvision import transforms 16 | from dinov.utils import configurable 17 | 18 | __all__ = ["BDDSemDatasetMapper"] 19 | 20 | 21 | # This is specifically designed for the COCO dataset. 22 | class BDDSemDatasetMapper: 23 | """ 24 | A callable which takes a dataset dict in Detectron2 Dataset format, 25 | and map it into a format used by MaskFormer. 26 | 27 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. 28 | 29 | The callable currently does the following: 30 | 31 | 1. Read the image from "file_name" 32 | 2. Applies geometric transforms to the image and annotation 33 | 3. Find and applies suitable cropping to the image and annotation 34 | 4. 
Prepare image and annotation to Tensors 35 | """ 36 | 37 | @configurable 38 | def __init__( 39 | self, 40 | is_train=True, 41 | min_size_test=None, 42 | max_size_test=None, 43 | mean=None, 44 | std=None, 45 | ): 46 | """ 47 | NOTE: this interface is experimental. 48 | Args: 49 | is_train: for training or inference 50 | augmentations: a list of augmentations or deterministic transforms to apply 51 | tfm_gens: data augmentation 52 | image_format: an image format supported by :func:`detection_utils.read_image`. 53 | """ 54 | self.is_train = is_train 55 | self.min_size_test = min_size_test 56 | self.max_size_test = max_size_test 57 | self.pixel_mean = torch.tensor(mean)[:,None,None] 58 | self.pixel_std = torch.tensor(std)[:,None,None] 59 | 60 | t = [] 61 | t.append(transforms.Resize(self.min_size_test, interpolation=Image.BICUBIC)) 62 | self.transform = transforms.Compose(t) 63 | 64 | @classmethod 65 | def from_config(cls, cfg, is_train=True): 66 | ret = { 67 | "is_train": is_train, 68 | "min_size_test": cfg['INPUT']['MIN_SIZE_TEST'], 69 | "max_size_test": cfg['INPUT']['MAX_SIZE_TEST'], 70 | "mean": cfg['INPUT']['PIXEL_MEAN'], 71 | "std": cfg['INPUT']['PIXEL_STD'], 72 | } 73 | return ret 74 | 75 | def read_semseg(self, file_name): 76 | if '.png' in file_name: 77 | semseg = np.asarray(Image.open(file_name)) 78 | elif '.mat' in file_name: 79 | semseg = scipy.io.loadmat(file_name)['LabelMap'] 80 | return semseg 81 | 82 | def __call__(self, dataset_dict): 83 | """ 84 | Args: 85 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 86 | 87 | Returns: 88 | dict: a format that builtin models in detectron2 accept 89 | """ 90 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 91 | file_name = dataset_dict['file_name'] 92 | semseg_name = dataset_dict['sem_seg_file_name'] 93 | image = Image.open(file_name).convert('RGB') 94 | 95 | dataset_dict['width'] = image.size[0] 96 | dataset_dict['height'] = image.size[1] 97 | 98 | if self.is_train == False: 99 | image = self.transform(image) 100 | image = torch.from_numpy(np.asarray(image).copy()) 101 | image = image.permute(2,0,1) 102 | 103 | semseg = self.read_semseg(semseg_name) 104 | semseg = torch.from_numpy(semseg.astype(np.int32)) 105 | dataset_dict['image'] = image 106 | dataset_dict['semseg'] = semseg 107 | return dataset_dict -------------------------------------------------------------------------------- /datasets/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2022 IDEA. All Rights Reserved. 3 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 4 | # ------------------------------------------------------------------------ 5 | # Modified from Mask2Former https://github.com/facebookresearch/Mask2Former by Feng Li. 6 | import copy 7 | import logging 8 | 9 | import numpy as np 10 | import torch 11 | 12 | from detectron2.config import configurable 13 | from detectron2.data import detection_utils as utils 14 | from detectron2.data import transforms as T 15 | from detectron2.data.transforms import TransformGen 16 | from detectron2.structures import BitMasks, Boxes, Instances 17 | 18 | __all__ = ["COCOPanopticNewBaselineDatasetMapper"] 19 | 20 | 21 | def build_transform_gen(cfg, is_train): 22 | """ 23 | Create a list of default :class:`Augmentation` from config. 
24 | Now it includes resizing and flipping. 25 | Returns: 26 | list[Augmentation] 27 | """ 28 | assert is_train, "Only support training augmentation" 29 | image_size = cfg.INPUT.IMAGE_SIZE 30 | min_scale = cfg.INPUT.MIN_SCALE 31 | max_scale = cfg.INPUT.MAX_SCALE 32 | 33 | augmentation = [] 34 | 35 | if cfg.INPUT.RANDOM_FLIP != "none": 36 | augmentation.append( 37 | T.RandomFlip( 38 | horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal", 39 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical", 40 | ) 41 | ) 42 | 43 | augmentation.extend([ 44 | T.ResizeScale( 45 | min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size 46 | ), 47 | T.FixedSizeCrop(crop_size=(image_size, image_size)), 48 | ]) 49 | 50 | return augmentation 51 | 52 | 53 | # This is specifically designed for the COCO dataset. 54 | class COCOPanopticNewBaselineDatasetMapper: 55 | """ 56 | A callable which takes a dataset dict in Detectron2 Dataset format, 57 | and map it into a format used by MaskFormer. 58 | 59 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. 60 | 61 | The callable currently does the following: 62 | 63 | 1. Read the image from "file_name" 64 | 2. Applies geometric transforms to the image and annotation 65 | 3. Find and applies suitable cropping to the image and annotation 66 | 4. Prepare image and annotation to Tensors 67 | """ 68 | 69 | @configurable 70 | def __init__( 71 | self, 72 | is_train=True, 73 | *, 74 | tfm_gens, 75 | image_format, 76 | ): 77 | """ 78 | NOTE: this interface is experimental. 79 | Args: 80 | is_train: for training or inference 81 | augmentations: a list of augmentations or deterministic transforms to apply 82 | crop_gen: crop augmentation 83 | tfm_gens: data augmentation 84 | image_format: an image format supported by :func:`detection_utils.read_image`. 85 | """ 86 | self.tfm_gens = tfm_gens 87 | logging.getLogger(__name__).info( 88 | "[COCOPanopticNewBaselineDatasetMapper] Full TransformGens used in training: {}".format( 89 | str(self.tfm_gens) 90 | ) 91 | ) 92 | 93 | self.img_format = image_format 94 | self.is_train = is_train 95 | 96 | @classmethod 97 | def from_config(cls, cfg, is_train=True): 98 | # Build augmentation 99 | tfm_gens = build_transform_gen(cfg, is_train) 100 | 101 | ret = { 102 | "is_train": is_train, 103 | "tfm_gens": tfm_gens, 104 | "image_format": cfg.INPUT.FORMAT, 105 | } 106 | return ret 107 | 108 | def __call__(self, dataset_dict): 109 | """ 110 | Args: 111 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 112 | 113 | Returns: 114 | dict: a format that builtin models in detectron2 accept 115 | """ 116 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 117 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 118 | utils.check_image_size(dataset_dict, image) 119 | 120 | image, transforms = T.apply_transform_gens(self.tfm_gens, image) 121 | image_shape = image.shape[:2] # h, w 122 | 123 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 124 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 125 | # Therefore it's important to use torch.Tensor. 126 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 127 | 128 | if not self.is_train: 129 | # USER: Modify this if you want to keep them for some reason. 
130 | dataset_dict.pop("annotations", None) 131 | return dataset_dict 132 | 133 | if "pan_seg_file_name" in dataset_dict: 134 | pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") 135 | segments_info = dataset_dict["segments_info"] 136 | 137 | # apply the same transformation to panoptic segmentation 138 | pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) 139 | 140 | from panopticapi.utils import rgb2id 141 | 142 | pan_seg_gt = rgb2id(pan_seg_gt) 143 | 144 | instances = Instances(image_shape) 145 | classes = [] 146 | masks = [] 147 | for segment_info in segments_info: 148 | class_id = segment_info["category_id"] 149 | if not segment_info["iscrowd"]: 150 | classes.append(class_id) 151 | masks.append(pan_seg_gt == segment_info["id"]) 152 | 153 | classes = np.array(classes) 154 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64) 155 | if len(masks) == 0: 156 | # Some image does not have annotation (all ignored) 157 | instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) 158 | instances.gt_boxes = Boxes(torch.zeros((0, 4))) 159 | else: 160 | masks = BitMasks( 161 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) 162 | ) 163 | instances.gt_masks = masks.tensor 164 | instances.gt_boxes = masks.get_bounding_boxes() 165 | 166 | dataset_dict["instances"] = instances 167 | 168 | return dataset_dict 169 | -------------------------------------------------------------------------------- /datasets/dataset_mappers/lvis_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py 3 | import copy 4 | import random 5 | 6 | import scipy.io 7 | import numpy as np 8 | import torch 9 | from PIL import Image 10 | 11 | from torchvision import transforms 12 | 13 | from pycocotools import mask 14 | from detectron2.data import detection_utils as utils 15 | from detectron2.data import transforms as T 16 | from detectron2.data import MetadataCatalog 17 | 18 | from dinov.utils import configurable  # needed for the @configurable decorator used below 19 | 20 | __all__ = ["LVISDatasetMapper"] 21 | 22 | def build_transform_gen(cfg, is_train): 23 | """ 24 | Create a list of default :class:`Augmentation` from config. 25 | Now it includes resizing and flipping. 26 | Returns: 27 | list[Augmentation] 28 | """ 29 | assert is_train, "Only support training augmentation" 30 | cfg_input = cfg['INPUT'] 31 | image_size = cfg_input['IMAGE_SIZE'] 32 | min_scale = cfg_input['MIN_SCALE'] 33 | max_scale = cfg_input['MAX_SCALE'] 34 | 35 | augmentation = [] 36 | 37 | 38 | if cfg_input['RANDOM_FLIP'] != "none": 39 | augmentation.append( 40 | T.RandomFlip( 41 | horizontal=cfg_input['RANDOM_FLIP'] == "horizontal", 42 | vertical=cfg_input['RANDOM_FLIP'] == "vertical", 43 | ) 44 | ) 45 | 46 | augmentation.extend([ 47 | T.ResizeScale( 48 | min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size 49 | ), 50 | T.FixedSizeCrop(crop_size=(image_size, image_size)), 51 | ]) 52 | 53 | return augmentation 54 | 55 | 56 | # This is specifically designed for the LVIS dataset. 57 | class LVISDatasetMapper: 58 | """ 59 | A callable which takes a dataset dict in Detectron2 Dataset format, 60 | and map it into a format used by MaskFormer. 61 | 62 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
63 | 64 | The callable currently does the following: 65 | 66 | 1. Read the image from "file_name" 67 | 2. Applies geometric transforms to the image and annotation 68 | 3. Find and applies suitable cropping to the image and annotation 69 | 4. Prepare image and annotation to Tensors 70 | """ 71 | 72 | @configurable 73 | def __init__( 74 | self, 75 | is_train=True, 76 | tfm_gens=None, 77 | image_format=None, 78 | min_size_test=None, 79 | max_size_test=None, 80 | mean=None, 81 | std=None, 82 | max_len=None, 83 | ): 84 | """ 85 | NOTE: this interface is experimental. 86 | Args: 87 | is_train: for training or inference 88 | augmentations: a list of augmentations or deterministic transforms to apply 89 | tfm_gens: data augmentation 90 | image_format: an image format supported by :func:`detection_utils.read_image`. 91 | """ 92 | self.tfm_gens = tfm_gens 93 | self.img_format = image_format 94 | self.is_train = is_train 95 | self.min_size_test = min_size_test 96 | self.max_size_test = max_size_test 97 | self.pixel_mean = torch.tensor(mean)[:,None,None] 98 | self.pixel_std = torch.tensor(std)[:,None,None] 99 | self.max_grounding_num = max_len 100 | 101 | t = [] 102 | t.append(transforms.Resize(self.min_size_test, interpolation=Image.BICUBIC)) 103 | self.transform = transforms.Compose(t) 104 | self.categories = torch.load(MetadataCatalog.get('logistic').get('cat_root')) 105 | 106 | @classmethod 107 | def from_config(cls, cfg, is_train=True): 108 | # Build augmentation 109 | if is_train: 110 | tfm_gens = build_transform_gen(cfg, is_train) 111 | else: 112 | tfm_gens = None 113 | 114 | ret = { 115 | "is_train": is_train, 116 | "tfm_gens": tfm_gens, 117 | "image_format": cfg['INPUT']['FORMAT'], 118 | "min_size_test": cfg['INPUT']['MIN_SIZE_TEST'], 119 | "max_size_test": cfg['INPUT']['MAX_SIZE_TEST'], 120 | "mean": cfg['INPUT']['PIXEL_MEAN'], 121 | "std": cfg['INPUT']['PIXEL_STD'], 122 | "max_len": cfg['MODEL']['DECODER']['GROUNDING']['MAX_LEN'], 123 | } 124 | return ret 125 | 126 | def __call__(self, dataset_dict): 127 | """ 128 | Args: 129 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 130 | 131 | Returns: 132 | dict: a format that builtin models in detectron2 accept 133 | """ 134 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 135 | file_name = dataset_dict['file_name'] 136 | if self.is_train == False: 137 | assert False, "Only support training." 
138 | else: 139 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 140 | utils.check_image_size(dataset_dict, image) 141 | image, transforms = T.apply_transform_gens(self.tfm_gens, image) 142 | image_shape = image.shape[:2] # h, w 143 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 144 | 145 | assert len(dataset_dict['instance']) > 0 146 | masks_grd = [] 147 | texts_grd = [] 148 | boxes_grd = [] 149 | hash_grd = [] 150 | for inst, label in zip(dataset_dict['instance'], dataset_dict['labels']): 151 | rle = mask.frPyObjects(inst, dataset_dict['height'], dataset_dict['width']) 152 | m = mask.decode(rle) 153 | # sometimes there are multiple binary map (corresponding to multiple segs) 154 | m = np.sum(m, axis=2) 155 | m = m.astype(np.uint8) # convert to np.uint8 156 | m = transforms.apply_segmentation(m[:,:,None])[:,:,0] 157 | masks_grd += [m] 158 | label_names = self.categories[label] 159 | rand_id = random.randint(0, len(label_names)-1) 160 | texts_grd.append(label_names[rand_id].lower()) 161 | hash_grd.append(hash(label_names[rand_id].lower())) 162 | 163 | indices = torch.randperm(len(hash_grd))[:self.max_grounding_num] 164 | masks_grd = torch.from_numpy(np.stack(masks_grd))[indices] 165 | boxes_grd = torch.tensor(boxes_grd) 166 | texts_grd = np.array(texts_grd)[indices.numpy()].tolist() 167 | hash_grd = np.array(hash_grd)[indices.numpy()].tolist() 168 | groundings = {'masks': masks_grd, 'texts': texts_grd, 'hash': hash_grd, 'mode': 'text'} 169 | dataset_dict["groundings"] = groundings 170 | return dataset_dict -------------------------------------------------------------------------------- /datasets/dataset_mappers/mask_former_panoptic_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | import logging 4 | 5 | import numpy as np 6 | import torch 7 | from torch.nn import functional as F 8 | 9 | from detectron2.data import detection_utils as utils 10 | from detectron2.data import transforms as T 11 | from detectron2.structures import BitMasks, Instances 12 | 13 | from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper 14 | from dinov.utils import configurable 15 | 16 | 17 | 18 | __all__ = ["MaskFormerPanopticDatasetMapper"] 19 | 20 | 21 | class MaskFormerPanopticDatasetMapper(MaskFormerSemanticDatasetMapper): 22 | """ 23 | A callable which takes a dataset dict in Detectron2 Dataset format, 24 | and map it into a format used by MaskFormer for panoptic segmentation. 25 | 26 | The callable currently does the following: 27 | 28 | 1. Read the image from "file_name" 29 | 2. Applies geometric transforms to the image and annotation 30 | 3. Find and applies suitable cropping to the image and annotation 31 | 4. Prepare image and annotation to Tensors 32 | """ 33 | 34 | @configurable 35 | def __init__( 36 | self, 37 | is_train=True, 38 | *, 39 | augmentations, 40 | image_format, 41 | ignore_label, 42 | size_divisibility, 43 | ): 44 | """ 45 | NOTE: this interface is experimental. 46 | Args: 47 | is_train: for training or inference 48 | augmentations: a list of augmentations or deterministic transforms to apply 49 | image_format: an image format supported by :func:`detection_utils.read_image`. 
50 | ignore_label: the label that is ignored to evaluation 51 | size_divisibility: pad image size to be divisible by this value 52 | """ 53 | super().__init__( 54 | is_train, 55 | augmentations=augmentations, 56 | image_format=image_format, 57 | ignore_label=ignore_label, 58 | size_divisibility=size_divisibility, 59 | ) 60 | 61 | def __call__(self, dataset_dict): 62 | """ 63 | Args: 64 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 65 | 66 | Returns: 67 | dict: a format that builtin models in detectron2 accept 68 | """ 69 | assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!" 70 | 71 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 72 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 73 | utils.check_image_size(dataset_dict, image) 74 | 75 | # semantic segmentation 76 | if "sem_seg_file_name" in dataset_dict: 77 | # PyTorch transformation not implemented for uint16, so converting it to double first 78 | sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") 79 | else: 80 | sem_seg_gt = None 81 | 82 | # panoptic segmentation 83 | if "pan_seg_file_name" in dataset_dict: 84 | pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") 85 | segments_info = dataset_dict["segments_info"] 86 | else: 87 | pan_seg_gt = None 88 | segments_info = None 89 | 90 | if pan_seg_gt is None: 91 | raise ValueError( 92 | "Cannot find 'pan_seg_file_name' for panoptic segmentation dataset {}.".format( 93 | dataset_dict["file_name"] 94 | ) 95 | ) 96 | 97 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt) 98 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) 99 | image = aug_input.image 100 | if sem_seg_gt is not None: 101 | sem_seg_gt = aug_input.sem_seg 102 | 103 | # apply the same transformation to panoptic segmentation 104 | pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) 105 | 106 | from panopticapi.utils import rgb2id 107 | 108 | pan_seg_gt = rgb2id(pan_seg_gt) 109 | 110 | # Pad image and segmentation label here! 111 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 112 | if sem_seg_gt is not None: 113 | sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) 114 | pan_seg_gt = torch.as_tensor(pan_seg_gt.astype("long")) 115 | 116 | if self.size_divisibility > 0: 117 | image_size = (image.shape[-2], image.shape[-1]) 118 | padding_size = [ 119 | 0, 120 | self.size_divisibility - image_size[1], 121 | 0, 122 | self.size_divisibility - image_size[0], 123 | ] 124 | image = F.pad(image, padding_size, value=128).contiguous() 125 | if sem_seg_gt is not None: 126 | sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous() 127 | pan_seg_gt = F.pad( 128 | pan_seg_gt, padding_size, value=0 129 | ).contiguous() # 0 is the VOID panoptic label 130 | 131 | image_shape = (image.shape[-2], image.shape[-1]) # h, w 132 | 133 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 134 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 135 | # Therefore it's important to use torch.Tensor. 
136 | dataset_dict["image"] = image 137 | if sem_seg_gt is not None: 138 | dataset_dict["sem_seg"] = sem_seg_gt.long() 139 | 140 | if "annotations" in dataset_dict: 141 | raise ValueError("Pemantic segmentation dataset should not have 'annotations'.") 142 | 143 | # Prepare per-category binary masks 144 | pan_seg_gt = pan_seg_gt.numpy() 145 | instances = Instances(image_shape) 146 | classes = [] 147 | masks = [] 148 | for segment_info in segments_info: 149 | class_id = segment_info["category_id"] 150 | if not segment_info["iscrowd"]: 151 | classes.append(class_id) 152 | masks.append(pan_seg_gt == segment_info["id"]) 153 | 154 | classes = np.array(classes) 155 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64) 156 | if len(masks) == 0: 157 | # Some image does not have annotation (all ignored) 158 | instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) 159 | else: 160 | masks = BitMasks( 161 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) 162 | ) 163 | instances.gt_masks = masks.tensor 164 | instances.gt_boxes = masks.get_bounding_boxes() 165 | 166 | dataset_dict["instances"] = instances 167 | 168 | return dataset_dict 169 | -------------------------------------------------------------------------------- /datasets/dataset_mappers/scannet_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # X-Decoder -- Generalized Decoding for Pixel, Image, and Language 3 | # Copyright (c) 2022 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified by Xueyan Zou (xueyan@cs.wisc.edu) 6 | # -------------------------------------------------------- 7 | # Copyright (c) Facebook, Inc. and its affiliates. 8 | import copy 9 | 10 | import scipy.io 11 | import numpy as np 12 | import torch 13 | from PIL import Image 14 | 15 | from torchvision import transforms 16 | from dinov.utils import configurable 17 | 18 | __all__ = ["ScanNetSegDatasetMapper"] 19 | 20 | 21 | # This is specifically designed for the COCO dataset. 22 | class ScanNetSegDatasetMapper: 23 | """ 24 | A callable which takes a dataset dict in Detectron2 Dataset format, 25 | and map it into a format used by MaskFormer. 26 | 27 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. 28 | 29 | The callable currently does the following: 30 | 31 | 1. Read the image from "file_name" 32 | 2. Applies geometric transforms to the image and annotation 33 | 3. Find and applies suitable cropping to the image and annotation 34 | 4. Prepare image and annotation to Tensors 35 | """ 36 | 37 | @configurable 38 | def __init__( 39 | self, 40 | is_train=True, 41 | min_size_test=None, 42 | max_size_test=None, 43 | mean=None, 44 | std=None, 45 | ): 46 | """ 47 | NOTE: this interface is experimental. 48 | Args: 49 | is_train: for training or inference 50 | augmentations: a list of augmentations or deterministic transforms to apply 51 | tfm_gens: data augmentation 52 | image_format: an image format supported by :func:`detection_utils.read_image`. 
53 | """ 54 | self.is_train = is_train 55 | self.min_size_test = min_size_test 56 | self.max_size_test = max_size_test 57 | self.pixel_mean = torch.tensor(mean)[:,None,None] 58 | self.pixel_std = torch.tensor(std)[:,None,None] 59 | 60 | t = [] 61 | t.append(transforms.Resize(self.min_size_test, interpolation=Image.BICUBIC)) 62 | self.transform = transforms.Compose(t) 63 | 64 | @classmethod 65 | def from_config(cls, cfg, is_train=True): 66 | ret = { 67 | "is_train": is_train, 68 | "min_size_test": cfg['INPUT']['MIN_SIZE_TEST'], 69 | "max_size_test": cfg['INPUT']['MAX_SIZE_TEST'], 70 | "mean": cfg['INPUT']['PIXEL_MEAN'], 71 | "std": cfg['INPUT']['PIXEL_STD'], 72 | } 73 | return ret 74 | 75 | def read_semseg(self, file_name): 76 | if '.png' in file_name: 77 | semseg = np.asarray(Image.open(file_name)) 78 | elif '.mat' in file_name: 79 | semseg = scipy.io.loadmat(file_name)['LabelMap'] 80 | return semseg 81 | 82 | def __call__(self, dataset_dict): 83 | """ 84 | Args: 85 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 86 | 87 | Returns: 88 | dict: a format that builtin models in detectron2 accept 89 | """ 90 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 91 | file_name = dataset_dict['file_name'] 92 | semseg_name = dataset_dict['sem_seg_file_name'] 93 | image = Image.open(file_name).convert('RGB') 94 | 95 | dataset_dict['width'] = image.size[0] 96 | dataset_dict['height'] = image.size[1] 97 | 98 | if self.is_train == False: 99 | image = self.transform(image) 100 | image = torch.from_numpy(np.asarray(image).copy()) 101 | image = image.permute(2,0,1) 102 | 103 | semseg = self.read_semseg(semseg_name) 104 | semseg = torch.from_numpy(semseg.astype(np.int32)) 105 | dataset_dict['image'] = image 106 | dataset_dict['semseg'] = semseg 107 | return dataset_dict -------------------------------------------------------------------------------- /datasets/dataset_mappers/scannet_pano_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # X-Decoder -- Generalized Decoding for Pixel, Image, and Language 3 | # Copyright (c) 2022 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified by Xueyan Zou (xueyan@cs.wisc.edu) 6 | # -------------------------------------------------------- 7 | # Copyright (c) Facebook, Inc. and its affiliates. 8 | import copy 9 | 10 | import scipy.io 11 | import numpy as np 12 | import torch 13 | from PIL import Image 14 | 15 | from torchvision import transforms 16 | from dinov.utils import configurable 17 | 18 | __all__ = ["ScanNetPanoDatasetMapper"] 19 | 20 | 21 | # This is specifically designed for the COCO dataset. 22 | class ScanNetPanoDatasetMapper: 23 | """ 24 | A callable which takes a dataset dict in Detectron2 Dataset format, 25 | and map it into a format used by MaskFormer. 26 | 27 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. 28 | 29 | The callable currently does the following: 30 | 31 | 1. Read the image from "file_name" 32 | 2. Applies geometric transforms to the image and annotation 33 | 3. Find and applies suitable cropping to the image and annotation 34 | 4. Prepare image and annotation to Tensors 35 | """ 36 | 37 | @configurable 38 | def __init__( 39 | self, 40 | is_train=True, 41 | min_size_test=None, 42 | max_size_test=None, 43 | mean=None, 44 | std=None, 45 | ): 46 | """ 47 | NOTE: this interface is experimental. 
48 | Args: 49 | is_train: for training or inference 50 | augmentations: a list of augmentations or deterministic transforms to apply 51 | tfm_gens: data augmentation 52 | image_format: an image format supported by :func:`detection_utils.read_image`. 53 | """ 54 | self.is_train = is_train 55 | self.min_size_test = min_size_test 56 | self.max_size_test = max_size_test 57 | self.pixel_mean = torch.tensor(mean)[:,None,None] 58 | self.pixel_std = torch.tensor(std)[:,None,None] 59 | 60 | t = [] 61 | t.append(transforms.Resize(self.min_size_test, interpolation=Image.BICUBIC)) 62 | self.transform = transforms.Compose(t) 63 | 64 | @classmethod 65 | def from_config(cls, cfg, is_train=True): 66 | ret = { 67 | "is_train": is_train, 68 | "min_size_test": cfg['INPUT']['MIN_SIZE_TEST'], 69 | "max_size_test": cfg['INPUT']['MAX_SIZE_TEST'], 70 | "mean": cfg['INPUT']['PIXEL_MEAN'], 71 | "std": cfg['INPUT']['PIXEL_STD'], 72 | } 73 | return ret 74 | 75 | def __call__(self, dataset_dict): 76 | """ 77 | Args: 78 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 79 | 80 | Returns: 81 | dict: a format that builtin models in detectron2 accept 82 | """ 83 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 84 | file_name = dataset_dict['file_name'] 85 | image = Image.open(file_name).convert('RGB') 86 | 87 | dataset_dict['file_name'] = '_'.join(file_name.split('/')[-3:]) # HACK for /tmp file storage on predictions. 88 | dataset_dict['width'] = image.size[0] 89 | dataset_dict['height'] = image.size[1] 90 | 91 | image = self.transform(image) 92 | image = torch.from_numpy(np.asarray(image).copy()) 93 | image = image.permute(2,0,1) 94 | dataset_dict['image'] = image 95 | return dataset_dict -------------------------------------------------------------------------------- /datasets/dataset_mappers/sunrgbd_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # X-Decoder -- Generalized Decoding for Pixel, Image, and Language 3 | # Copyright (c) 2022 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified by Xueyan Zou (xueyan@cs.wisc.edu) 6 | # -------------------------------------------------------- 7 | # Copyright (c) Facebook, Inc. and its affiliates. 8 | import copy 9 | 10 | import scipy.io 11 | import numpy as np 12 | import torch 13 | from PIL import Image 14 | 15 | from torchvision import transforms 16 | from dinov.utils import configurable 17 | 18 | __all__ = ["SunRGBDSegDatasetMapper"] 19 | 20 | 21 | # This is specifically designed for the COCO dataset. 22 | class SunRGBDSegDatasetMapper: 23 | """ 24 | A callable which takes a dataset dict in Detectron2 Dataset format, 25 | and map it into a format used by MaskFormer. 26 | 27 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. 28 | 29 | The callable currently does the following: 30 | 31 | 1. Read the image from "file_name" 32 | 2. Applies geometric transforms to the image and annotation 33 | 3. Find and applies suitable cropping to the image and annotation 34 | 4. Prepare image and annotation to Tensors 35 | """ 36 | 37 | @configurable 38 | def __init__( 39 | self, 40 | is_train=True, 41 | min_size_test=None, 42 | max_size_test=None, 43 | mean=None, 44 | std=None, 45 | ): 46 | """ 47 | NOTE: this interface is experimental. 
48 | Args: 49 | is_train: for training or inference 50 | augmentations: a list of augmentations or deterministic transforms to apply 51 | tfm_gens: data augmentation 52 | image_format: an image format supported by :func:`detection_utils.read_image`. 53 | """ 54 | self.is_train = is_train 55 | self.min_size_test = min_size_test 56 | self.max_size_test = max_size_test 57 | self.pixel_mean = torch.tensor(mean)[:,None,None] 58 | self.pixel_std = torch.tensor(std)[:,None,None] 59 | 60 | t = [] 61 | t.append(transforms.Resize(self.min_size_test, interpolation=Image.BICUBIC)) 62 | self.transform = transforms.Compose(t) 63 | 64 | @classmethod 65 | def from_config(cls, cfg, is_train=True): 66 | ret = { 67 | "is_train": is_train, 68 | "min_size_test": cfg['INPUT']['MIN_SIZE_TEST'], 69 | "max_size_test": cfg['INPUT']['MAX_SIZE_TEST'], 70 | "mean": cfg['INPUT']['PIXEL_MEAN'], 71 | "std": cfg['INPUT']['PIXEL_STD'], 72 | } 73 | return ret 74 | 75 | def read_semseg(self, file_name): 76 | if '.png' in file_name: 77 | semseg = np.asarray(Image.open(file_name)) 78 | elif '.mat' in file_name: 79 | semseg = scipy.io.loadmat(file_name)['LabelMap'] 80 | return semseg 81 | 82 | def __call__(self, dataset_dict): 83 | """ 84 | Args: 85 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 86 | 87 | Returns: 88 | dict: a format that builtin models in detectron2 accept 89 | """ 90 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 91 | file_name = dataset_dict['file_name'] 92 | semseg_name = dataset_dict['sem_seg_file_name'] 93 | image = Image.open(file_name).convert('RGB') 94 | 95 | dataset_dict['width'] = image.size[0] 96 | dataset_dict['height'] = image.size[1] 97 | 98 | if self.is_train == False: 99 | image = self.transform(image) 100 | image = torch.from_numpy(np.asarray(image).copy()) 101 | image = image.permute(2,0,1) 102 | 103 | semseg = self.read_semseg(semseg_name) 104 | semseg = torch.from_numpy(semseg.astype(np.int32)) 105 | dataset_dict['image'] = image 106 | dataset_dict['semseg'] = semseg 107 | return dataset_dict -------------------------------------------------------------------------------- /datasets/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .instance_evaluation import * 2 | from .segmentation_evaluation import * 3 | from .panoptic_evaluation import * 4 | from .interactive_evaluation import * -------------------------------------------------------------------------------- /datasets/evaluation/instance_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
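The `InstanceSegEvaluator` defined in this file plugs into the standard Detectron2 evaluation loop. Below is a minimal, hypothetical usage sketch; the dataset name, output directory, and the `model` / `val_loader` arguments are assumptions for illustration, not code from this repository.

```python
# Hypothetical usage sketch only (not part of this repository).
from detectron2.evaluation import inference_on_dataset


def evaluate_instance_segmentation(model, val_loader, dataset_name="coco_2017_val"):
    # `model` and `val_loader` are assumed to come from the usual Detectron2
    # build_model / build_detection_test_loader helpers; the dataset name is a placeholder.
    # InstanceSegEvaluator is the class defined below in this file.
    evaluator = InstanceSegEvaluator(dataset_name, output_dir="./OUTPUT/eval")
    return inference_on_dataset(model, val_loader, evaluator)
```

Because the evaluator subclasses `COCOEvaluator`, it accepts the same constructor arguments (dataset name, optional task list, output directory).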
2 | import contextlib 3 | import copy 4 | import io 5 | import itertools 6 | import json 7 | import logging 8 | import numpy as np 9 | import os 10 | import pickle 11 | from collections import OrderedDict 12 | import pycocotools.mask as mask_util 13 | import torch 14 | from pycocotools.coco import COCO 15 | from pycocotools.cocoeval import COCOeval 16 | from tabulate import tabulate 17 | 18 | import detectron2.utils.comm as comm 19 | from detectron2.config import CfgNode 20 | from detectron2.data import MetadataCatalog 21 | from detectron2.data.datasets.coco import convert_to_coco_json 22 | from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco 23 | from detectron2.evaluation.fast_eval_api import COCOeval_opt 24 | from detectron2.structures import Boxes, BoxMode, pairwise_iou 25 | from detectron2.utils.file_io import PathManager 26 | from detectron2.utils.logger import create_small_table 27 | 28 | 29 | # modified from COCOEvaluator for instance segmetnat 30 | class InstanceSegEvaluator(COCOEvaluator): 31 | """ 32 | Evaluate AR for object proposals, AP for instance detection/segmentation, AP 33 | for keypoint detection outputs using COCO's metrics. 34 | See http://cocodataset.org/#detection-eval and 35 | http://cocodataset.org/#keypoints-eval to understand its metrics. 36 | The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means 37 | the metric cannot be computed (e.g. due to no predictions made). 38 | 39 | In addition to COCO, this evaluator is able to support any bounding box detection, 40 | instance segmentation, or keypoint detection dataset. 41 | """ 42 | 43 | def _eval_predictions(self, predictions, img_ids=None): 44 | """ 45 | Evaluate predictions. Fill self._results with the metrics of the tasks. 46 | """ 47 | self._logger.info("Preparing results for COCO format ...") 48 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) 49 | tasks = self._tasks or self._tasks_from_predictions(coco_results) 50 | 51 | # unmap the category ids for COCO 52 | if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): 53 | dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id 54 | # all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) 55 | # num_classes = len(all_contiguous_ids) 56 | # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 57 | 58 | reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} 59 | for result in coco_results: 60 | category_id = result["category_id"] 61 | # assert category_id < num_classes, ( 62 | # f"A prediction has class={category_id}, " 63 | # f"but the dataset only has {num_classes} classes and " 64 | # f"predicted class id should be in [0, {num_classes - 1}]." 65 | # ) 66 | assert category_id in reverse_id_mapping, ( 67 | f"A prediction has class={category_id}, " 68 | f"but the dataset only has class ids in {dataset_id_to_contiguous_id}." 
69 | ) 70 | result["category_id"] = reverse_id_mapping[category_id] 71 | 72 | if self._output_dir: 73 | file_path = os.path.join(self._output_dir, "coco_instances_results.json") 74 | self._logger.info("Saving results to {}".format(file_path)) 75 | with PathManager.open(file_path, "w") as f: 76 | f.write(json.dumps(coco_results)) 77 | f.flush() 78 | 79 | if not self._do_evaluation: 80 | self._logger.info("Annotations are not available for evaluation.") 81 | return 82 | 83 | self._logger.info( 84 | "Evaluating predictions with {} COCO API...".format( 85 | "unofficial" if self._use_fast_impl else "official" 86 | ) 87 | ) 88 | for task in sorted(tasks): 89 | assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!" 90 | coco_eval = ( 91 | _evaluate_predictions_on_coco( 92 | self._coco_api, 93 | coco_results, 94 | task, 95 | kpt_oks_sigmas=self._kpt_oks_sigmas, 96 | use_fast_impl=self._use_fast_impl, 97 | img_ids=img_ids, 98 | max_dets_per_image=self._max_dets_per_image, 99 | ) 100 | if len(coco_results) > 0 101 | else None # cocoapi does not handle empty results very well 102 | ) 103 | 104 | res = self._derive_coco_results( 105 | coco_eval, task, class_names=self._metadata.get("thing_classes") 106 | ) 107 | self._results[task] = res 108 | -------------------------------------------------------------------------------- /datasets/evaluation/pascal_part_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import contextlib 3 | import copy 4 | import io 5 | import itertools 6 | import json 7 | import logging 8 | import numpy as np 9 | import os 10 | import pickle 11 | from collections import OrderedDict 12 | import pycocotools.mask as mask_util 13 | import torch 14 | from pycocotools.coco import COCO 15 | from pycocotools.cocoeval import COCOeval 16 | from tabulate import tabulate 17 | 18 | import detectron2.utils.comm as comm 19 | from detectron2.config import CfgNode 20 | from detectron2.data import MetadataCatalog 21 | from detectron2.data.datasets.coco import convert_to_coco_json 22 | from detectron2.evaluation.coco_evaluation import COCOEvaluator 23 | from detectron2.structures import Boxes, BoxMode, pairwise_iou 24 | from detectron2.utils.file_io import PathManager 25 | from detectron2.utils.logger import create_small_table 26 | from ..registration.register_pascal_part_all import ( 27 | PASCAL_PART_BASE_CATEGORIES as categories_seen, 28 | PASCAL_PART_NOVEL_CATEGORIES as categories_unseen, 29 | ) 30 | 31 | 32 | class PASCALPARTEvaluator(COCOEvaluator): 33 | """ 34 | PASCALPARTEvaluator on open_vocabulary 35 | """ 36 | 37 | def _derive_coco_results(self, coco_eval, iou_type, class_names=None): 38 | """ 39 | Additionally plot mAP for 'seen classes' and 'unseen classes' 40 | """ 41 | 42 | metrics = { 43 | "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl"], 44 | "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl"], 45 | "keypoints": ["AP", "AP50", "AP75", "APm", "APl"], 46 | }[iou_type] 47 | 48 | if coco_eval is None: 49 | self._logger.warn("No predictions from the model!") 50 | return {metric: float("nan") for metric in metrics} 51 | 52 | # the standard metrics 53 | results = { 54 | metric: float(coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan") 55 | for idx, metric in enumerate(metrics) 56 | } 57 | self._logger.info( 58 | "Evaluation results for {}: \n".format(iou_type) + create_small_table(results) 59 | ) 60 | if not np.isfinite(sum(results.values())): 61 | 
self._logger.info("Some metrics cannot be computed and is shown as NaN.") 62 | 63 | if class_names is None or len(class_names) <= 1: 64 | return results 65 | # Compute per-category AP 66 | # from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252 # noqa 67 | precisions = coco_eval.eval["precision"] 68 | # precision has dims (iou, recall, cls, area range, max dets) 69 | assert len(class_names) == precisions.shape[2] 70 | 71 | seen_names = set([x['name'] for x in categories_seen]) 72 | unseen_names = set([x['name'] for x in categories_unseen]) 73 | results_per_category = [] 74 | results_per_category50 = [] 75 | results_per_category_seen = [] 76 | results_per_category_unseen = [] 77 | results_per_category50_seen = [] 78 | results_per_category50_unseen = [] 79 | for idx, name in enumerate(class_names): 80 | # area range index 0: all area ranges 81 | # max dets index -1: typically 100 per image 82 | precision = precisions[:, :, idx, 0, -1] 83 | precision = precision[precision > -1] 84 | ap = np.mean(precision) if precision.size else float("nan") 85 | results_per_category.append(("{}".format(name), float(ap * 100))) 86 | precision50 = precisions[0, :, idx, 0, -1] 87 | precision50 = precision50[precision50 > -1] 88 | ap50 = np.mean(precision50) if precision50.size else float("nan") 89 | results_per_category50.append(("{}".format(name), float(ap50 * 100))) 90 | if name in seen_names: 91 | results_per_category_seen.append(float(ap * 100)) 92 | results_per_category50_seen.append(float(ap50 * 100)) 93 | if name in unseen_names: 94 | results_per_category_unseen.append(float(ap * 100)) 95 | results_per_category50_unseen.append(float(ap50 * 100)) 96 | 97 | # tabulate it 98 | N_COLS = min(6, len(results_per_category) * 2) 99 | results_flatten = list(itertools.chain(*results_per_category)) 100 | results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)]) 101 | table = tabulate( 102 | results_2d, 103 | tablefmt="pipe", 104 | floatfmt=".3f", 105 | headers=["category", "AP"] * (N_COLS // 2), 106 | numalign="left", 107 | ) 108 | self._logger.info("Per-category {} AP: \n".format(iou_type) + table) 109 | 110 | N_COLS = min(6, len(results_per_category50) * 2) 111 | results_flatten = list(itertools.chain(*results_per_category50)) 112 | results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)]) 113 | table = tabulate( 114 | results_2d, 115 | tablefmt="pipe", 116 | floatfmt=".3f", 117 | headers=["category", "AP50"] * (N_COLS // 2), 118 | numalign="left", 119 | ) 120 | self._logger.info("Per-category {} AP50: \n".format(iou_type) + table) 121 | 122 | self._logger.info( 123 | "Seen {} AP: {}".format( 124 | iou_type, 125 | sum(results_per_category_seen) / len(results_per_category_seen), 126 | )) 127 | self._logger.info( 128 | "Unseen {} AP: {}".format( 129 | iou_type, 130 | sum(results_per_category_unseen) / len(results_per_category_unseen), 131 | )) 132 | 133 | self._logger.info( 134 | "Seen {} AP50: {}".format( 135 | iou_type, 136 | sum(results_per_category50_seen) / len(results_per_category50_seen), 137 | )) 138 | self._logger.info( 139 | "Unseen {} AP50: {}".format( 140 | iou_type, 141 | sum(results_per_category50_unseen) / len(results_per_category50_unseen), 142 | )) 143 | 144 | results.update({"AP-" + name: ap for name, ap in results_per_category}) 145 | results["AP-seen"] = sum(results_per_category_seen) / len(results_per_category_seen) 146 | 
results["AP-unseen"] = sum(results_per_category_unseen) / len(results_per_category_unseen) 147 | results["AP50-seen"] = sum(results_per_category50_seen) / len(results_per_category50_seen) 148 | results["AP50-unseen"] = sum(results_per_category50_unseen) / len(results_per_category50_unseen) 149 | return results -------------------------------------------------------------------------------- /datasets/registration/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import ( 3 | register_ade20k_full, 4 | register_ade20k_panoptic, 5 | register_coco_stuff_10k, 6 | register_coco_panoptic_annos_semseg, 7 | register_coco_panoptic_annos_semseg_interactive, 8 | register_coco_panoptic_annos_semseg_interactive_jointboxpoint, 9 | register_ade20k_instance, 10 | register_sam, 11 | register_sunrgbd_semseg, 12 | register_scannet_semseg, 13 | register_bdd100k_semseg, 14 | register_scannet_panoptic, 15 | register_bdd100k_panoseg, 16 | register_object365_od, 17 | register_pascal_part_all, 18 | register_pascal_part_all_interactive, 19 | register_paco_part_all, 20 | register_partimagenet_part_all, 21 | ) 22 | 23 | from . import ( 24 | register_ytvos_dataset, 25 | register_davis_dataset, 26 | register_seginw_instance, 27 | register_lvis_eval, 28 | register_context_semseg, 29 | register_odinw_od, 30 | ) -------------------------------------------------------------------------------- /datasets/registration/register_ade20k_instance.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import json 3 | import logging 4 | import numpy as np 5 | import os 6 | from PIL import Image 7 | 8 | from detectron2.data import DatasetCatalog, MetadataCatalog 9 | from detectron2.data.datasets.coco import load_coco_json, register_coco_instances 10 | from detectron2.utils.file_io import PathManager 11 | 12 | ADE_CATEGORIES = [{'id': 7, 'name': 'bed'}, {'id': 8, 'name': 'windowpane'}, {'id': 10, 'name': 'cabinet'}, {'id': 12, 'name': 'person'}, {'id': 14, 'name': 'door'}, {'id': 15, 'name': 'table'}, {'id': 18, 'name': 'curtain'}, {'id': 19, 'name': 'chair'}, {'id': 20, 'name': 'car'}, {'id': 22, 'name': 'painting'}, {'id': 23, 'name': 'sofa'}, {'id': 24, 'name': 'shelf'}, {'id': 27, 'name': 'mirror'}, {'id': 30, 'name': 'armchair'}, {'id': 31, 'name': 'seat'}, {'id': 32, 'name': 'fence'}, {'id': 33, 'name': 'desk'}, {'id': 35, 'name': 'wardrobe'}, {'id': 36, 'name': 'lamp'}, {'id': 37, 'name': 'bathtub'}, {'id': 38, 'name': 'railing'}, {'id': 39, 'name': 'cushion'}, {'id': 41, 'name': 'box'}, {'id': 42, 'name': 'column'}, {'id': 43, 'name': 'signboard'}, {'id': 44, 'name': 'chest of drawers'}, {'id': 45, 'name': 'counter'}, {'id': 47, 'name': 'sink'}, {'id': 49, 'name': 'fireplace'}, {'id': 50, 'name': 'refrigerator'}, {'id': 53, 'name': 'stairs'}, {'id': 55, 'name': 'case'}, {'id': 56, 'name': 'pool table'}, {'id': 57, 'name': 'pillow'}, {'id': 58, 'name': 'screen door'}, {'id': 62, 'name': 'bookcase'}, {'id': 64, 'name': 'coffee table'}, {'id': 65, 'name': 'toilet'}, {'id': 66, 'name': 'flower'}, {'id': 67, 'name': 'book'}, {'id': 69, 'name': 'bench'}, {'id': 70, 'name': 'countertop'}, {'id': 71, 'name': 'stove'}, {'id': 72, 'name': 'palm'}, {'id': 73, 'name': 'kitchen island'}, {'id': 74, 'name': 'computer'}, {'id': 75, 'name': 'swivel chair'}, {'id': 76, 'name': 'boat'}, {'id': 78, 'name': 'arcade machine'}, {'id': 80, 'name': 'bus'}, 
{'id': 81, 'name': 'towel'}, {'id': 82, 'name': 'light'}, {'id': 83, 'name': 'truck'}, {'id': 85, 'name': 'chandelier'}, {'id': 86, 'name': 'awning'}, {'id': 87, 'name': 'streetlight'}, {'id': 88, 'name': 'booth'}, {'id': 89, 'name': 'television receiver'}, {'id': 90, 'name': 'airplane'}, {'id': 92, 'name': 'apparel'}, {'id': 93, 'name': 'pole'}, {'id': 95, 'name': 'bannister'}, {'id': 97, 'name': 'ottoman'}, {'id': 98, 'name': 'bottle'}, {'id': 102, 'name': 'van'}, {'id': 103, 'name': 'ship'}, {'id': 104, 'name': 'fountain'}, {'id': 107, 'name': 'washer'}, {'id': 108, 'name': 'plaything'}, {'id': 110, 'name': 'stool'}, {'id': 111, 'name': 'barrel'}, {'id': 112, 'name': 'basket'}, {'id': 115, 'name': 'bag'}, {'id': 116, 'name': 'minibike'}, {'id': 118, 'name': 'oven'}, {'id': 119, 'name': 'ball'}, {'id': 120, 'name': 'food'}, {'id': 121, 'name': 'step'}, {'id': 123, 'name': 'trade name'}, {'id': 124, 'name': 'microwave'}, {'id': 125, 'name': 'pot'}, {'id': 126, 'name': 'animal'}, {'id': 127, 'name': 'bicycle'}, {'id': 129, 'name': 'dishwasher'}, {'id': 130, 'name': 'screen'}, {'id': 132, 'name': 'sculpture'}, {'id': 133, 'name': 'hood'}, {'id': 134, 'name': 'sconce'}, {'id': 135, 'name': 'vase'}, {'id': 136, 'name': 'traffic light'}, {'id': 137, 'name': 'tray'}, {'id': 138, 'name': 'ashcan'}, {'id': 139, 'name': 'fan'}, {'id': 142, 'name': 'plate'}, {'id': 143, 'name': 'monitor'}, {'id': 144, 'name': 'bulletin board'}, {'id': 146, 'name': 'radiator'}, {'id': 147, 'name': 'glass'}, {'id': 148, 'name': 'clock'}, {'id': 149, 'name': 'flag'}] 13 | 14 | 15 | _PREDEFINED_SPLITS = { 16 | # point annotations without masks 17 | "ade20k_instance_train": ( 18 | "ADEChallengeData2016/images/training", 19 | "ADEChallengeData2016/ade20k_instance_train.json", 20 | ), 21 | "ade20k_instance_val": ( 22 | "ADEChallengeData2016/images/validation", 23 | "ADEChallengeData2016/ade20k_instance_val.json", 24 | ), 25 | } 26 | 27 | 28 | def _get_ade_instances_meta(): 29 | thing_ids = [k["id"] for k in ADE_CATEGORIES] 30 | assert len(thing_ids) == 100, len(thing_ids) 31 | # Mapping from the incontiguous ADE category id to an id in [0, 99] 32 | thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} 33 | thing_classes = [k["name"] for k in ADE_CATEGORIES] 34 | ret = { 35 | "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, 36 | "thing_classes": thing_classes, 37 | } 38 | return ret 39 | 40 | 41 | def register_all_ade20k_instance(root): 42 | for key, (image_root, json_file) in _PREDEFINED_SPLITS.items(): 43 | # Assume pre-defined datasets live in `./datasets`. 44 | register_coco_instances( 45 | key, 46 | _get_ade_instances_meta(), 47 | os.path.join(root, json_file) if "://" not in json_file else json_file, 48 | os.path.join(root, image_root), 49 | ) 50 | 51 | 52 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 53 | if _root!='datasets': 54 | register_all_ade20k_instance(_root) 55 | -------------------------------------------------------------------------------- /datasets/registration/register_bdd100k_semseg.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # X-Decoder -- Generalized Decoding for Pixel, Image, and Language 3 | # Copyright (c) 2022 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified by Xueyan Zou (xueyan@cs.wisc.edu) 6 | # -------------------------------------------------------- 7 | # Copyright (c) Facebook, Inc. 
and its affiliates. 8 | import numpy as np 9 | import os 10 | import glob 11 | from typing import List, Tuple, Union 12 | 13 | from detectron2.data import DatasetCatalog, MetadataCatalog 14 | from detectron2.utils.file_io import PathManager 15 | 16 | from utils.constants import BDD_SEM 17 | 18 | __all__ = ["load_scannet_instances", "register_scannet_context"] 19 | 20 | 21 | def load_bdd_instances(name: str, dirname: str, split: str, class_names: Union[List[str], Tuple[str, ...]]): 22 | """ 23 | Load BDD annotations to Detectron2 format. 24 | 25 | Args: 26 | dirname: Contain "Annotations", "ImageSets", "JPEGImages" 27 | split (str): one of "train", "test", "val", "trainval" 28 | class_names: list or tuple of class names 29 | """ 30 | img_folder = os.path.join(dirname, 'images', '10k', split) 31 | img_pths = sorted(glob.glob(os.path.join(img_folder, '*.jpg'))) 32 | 33 | sem_folder = os.path.join(dirname, 'labels', 'sem_seg', 'masks', split) 34 | sem_pths = sorted(glob.glob(os.path.join(sem_folder, '*.png'))) 35 | 36 | assert len(img_pths) == len(sem_pths) 37 | 38 | dicts = [] 39 | for img_pth, sem_pth in zip(img_pths, sem_pths): 40 | r = { 41 | "file_name": img_pth, 42 | "sem_seg_file_name": sem_pth, 43 | "image_id": img_pth.split('/')[-1].split('.')[0], 44 | } 45 | dicts.append(r) 46 | return dicts 47 | 48 | 49 | def register_bdd_context(name, dirname, split, class_names=BDD_SEM): 50 | DatasetCatalog.register(name, lambda: load_bdd_instances(name, dirname, split, class_names)) 51 | MetadataCatalog.get(name).set( 52 | stuff_classes=class_names, 53 | dirname=dirname, 54 | split=split, 55 | ignore_label=[255], 56 | thing_dataset_id_to_contiguous_id={}, 57 | class_offset=0, 58 | keep_sem_bgd=False 59 | ) 60 | 61 | 62 | def register_all_sunrgbd_seg(root): 63 | SPLITS = [ 64 | ("bdd10k_val_sem_seg", "bdd100k", "val"), 65 | ] 66 | 67 | for name, dirname, split in SPLITS: 68 | register_bdd_context(name, os.path.join(root, dirname), split) 69 | MetadataCatalog.get(name).evaluator_type = "sem_seg" 70 | 71 | 72 | _root = os.getenv("DATSETW", "datasets") 73 | if _root!='datasets': 74 | register_all_sunrgbd_seg(_root) -------------------------------------------------------------------------------- /datasets/registration/register_context_semseg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | import numpy as np 4 | import os 5 | import xml.etree.ElementTree as ET 6 | from typing import List, Tuple, Union 7 | 8 | from detectron2.data import DatasetCatalog, MetadataCatalog 9 | from detectron2.structures import BoxMode 10 | from detectron2.utils.file_io import PathManager 11 | 12 | from utils.constants import PASCAL_CONTEXT_459, PASCAL_CONTEXT_59, PASCAL_CONTEXT_33 13 | 14 | __all__ = ["load_context_instances", "register_pascal_context"] 15 | dataset2class = {"context_459_val_seg": PASCAL_CONTEXT_459, 16 | "context_59_val_seg": PASCAL_CONTEXT_59} 17 | dataset2labelfolder = {"context_459_val_seg": "trainval", 18 | "context_59_val_seg": "59_context_labels"} 19 | dataset2postfix = {"context_459_val_seg": ".mat", 20 | "context_59_val_seg": ".png"} 21 | dataset2segloader = {"context_459_val_seg": "MAT", 22 | "context_59_val_seg": "PIL"} 23 | 24 | 25 | def load_context_instances(name: str, dirname: str, split: str, class_names: Union[List[str], Tuple[str, ...]]): 26 | """ 27 | Load Pascal VOC detection annotations to Detectron2 format. 
28 | 29 | Args: 30 | dirname: Contain "Annotations", "ImageSets", "JPEGImages" 31 | split (str): one of "train", "test", "val", "trainval" 32 | class_names: list or tuple of class names 33 | """ 34 | with PathManager.open(os.path.join(dirname, "VOC2010", "ImageSets", "Main", split + ".txt")) as f: 35 | fileids = np.loadtxt(f, dtype=np.str) 36 | 37 | # Needs to read many small annotation files. Makes sense at local 38 | image_dirname = PathManager.get_local_path(os.path.join(dirname, "VOC2010")) 39 | semseg_dirname = PathManager.get_local_path(os.path.join(dirname, dataset2labelfolder[name])) 40 | 41 | dicts = [] 42 | for fileid in fileids: 43 | jpeg_file = os.path.join(image_dirname, "JPEGImages", fileid + ".jpg") 44 | seg_file = os.path.join(semseg_dirname, fileid + dataset2postfix[name]) 45 | 46 | r = { 47 | "file_name": jpeg_file, 48 | "sem_seg_file_name": seg_file, 49 | "image_id": fileid, 50 | } 51 | dicts.append(r) 52 | return dicts 53 | 54 | 55 | def register_pascal_context(name, dirname, split, year, class_names=dataset2class): 56 | DatasetCatalog.register(name, lambda: load_context_instances(name, dirname, split, class_names)) 57 | MetadataCatalog.get(name).set( 58 | stuff_classes=class_names[name], 59 | dirname=dirname, 60 | year=year, 61 | split=split, 62 | ignore_label=[0], 63 | thing_dataset_id_to_contiguous_id={}, 64 | class_offset=1, 65 | semseg_loader=dataset2segloader[name], 66 | keep_sem_bgd=False 67 | ) 68 | 69 | 70 | def register_all_context_seg(root): 71 | SPLITS = [ 72 | ("context_459_val_seg", "pascal_context", "val"), 73 | ("context_59_val_seg", "pascal_context", "val"), 74 | ] 75 | year = 2010 76 | for name, dirname, split in SPLITS: 77 | register_pascal_context(name, os.path.join(root, dirname), split, year) 78 | MetadataCatalog.get(name).evaluator_type = "sem_seg" 79 | 80 | 81 | _root = os.getenv("DATSETW", "datasets") 82 | if _root!='datasets': 83 | register_all_context_seg(_root) -------------------------------------------------------------------------------- /datasets/registration/register_davis_dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | import os 4 | import glob 5 | import json 6 | from typing import List, Tuple, Union 7 | 8 | import cv2 9 | import numpy as np 10 | from scipy.io import loadmat 11 | 12 | from detectron2.data import DatasetCatalog, MetadataCatalog 13 | from detectron2.structures import BoxMode 14 | from detectron2.utils.file_io import PathManager 15 | 16 | 17 | __all__ = ["load_davis_instances", "register_davis_context"] 18 | 19 | def load_davis_instances(name: str, dirname: str, split: str, year: str): 20 | """ 21 | Load Pascal VOC detection annotations to Detectron2 format. 
22 | 23 | Args: 24 | dirname: Contain "Annotations", "ImageSets", "JPEGImages" 25 | split (str): one of "train", "test", "val", "trainval" 26 | class_names: list or tuple of class names 27 | """ 28 | meta_txt = os.path.join(dirname, 'ImageSets', year, "{}.txt".format(split)) 29 | meta_json = os.path.join(dirname, 'video_objects_info.json') 30 | meta_json = json.load(open(meta_json))['videos'] 31 | video_names = [line.strip() for line in open(meta_txt).readlines()] 32 | 33 | video_dir = os.path.join(dirname, 'JPEGImages', '480p') 34 | mask_dir = os.path.join(dirname, 'Annotations', '480p') 35 | scibble_dir = os.path.join(dirname, 'Scribbles', '480p') 36 | semantic_dir = os.path.join(dirname, 'Annotations_semantics', '480p') 37 | 38 | dicts = [] 39 | for vid_name in video_names: 40 | objects = meta_json[vid_name]['objects'] 41 | r = { 42 | "file_name": os.path.join(video_dir, vid_name), 43 | "mask_name": os.path.join(mask_dir, vid_name), 44 | "scibble_name": os.path.join(scibble_dir, vid_name), 45 | "semantic_name": os.path.join(semantic_dir, vid_name), 46 | "objects": objects, 47 | } 48 | dicts.append(r) 49 | return dicts 50 | 51 | def register_davis_context(name, dirname, split, year): 52 | if not os.path.exists(dirname): 53 | print("not register for ", name) 54 | return -1 55 | load_davis_instances(name, dirname, split, year) 56 | DatasetCatalog.register("{}".format(name), lambda: load_davis_instances(name, dirname, split, year)) 57 | MetadataCatalog.get("{}".format(name)).set( 58 | dirname=dirname, 59 | thing_dataset_id_to_contiguous_id={}, 60 | ) 61 | 62 | def register_all_davis(root): 63 | SPLITS = [ 64 | ("davis17_val", "DAVIS17", "val", "2017"), 65 | ("davis16_val", "DAVIS17", "val", "2016"), 66 | ] 67 | 68 | for name, dirname, split, year in SPLITS: 69 | register_davis_context(name, os.path.join(root, dirname), split, year) 70 | MetadataCatalog.get("{}".format(name)).evaluator_type = None 71 | 72 | _root = os.getenv("TRACKING_DATASET", "datasets") 73 | if _root!='datasets': 74 | register_all_davis(_root) 75 | -------------------------------------------------------------------------------- /datasets/registration/register_lvis_eval.py: -------------------------------------------------------------------------------- 1 | from detectron2.data.datasets import get_lvis_instances_meta 2 | from detectron2.data import DatasetCatalog, MetadataCatalog 3 | from utils.lvis_cat import LVIS_CATEGORIES as LVIS_V1_CATEGORIES 4 | # from utils.constants import LVIS_CATEGORIES as LVIS_V1_CATEGORIES 5 | import logging 6 | import os 7 | from detectron2.utils.file_io import PathManager 8 | from fvcore.common.timer import Timer 9 | import json 10 | 11 | 12 | 13 | _PREDEFINED_SPLITS_LVIS = { 14 | "lvis_v1": { 15 | "lvis_v1_minival": ("coco/", "coco/annotations/lvis_v1_minival_inserted_image_name.json"), 16 | "lvis_train": ("coco/", "lvis/lvis_v1_train.json"), 17 | }, 18 | } 19 | 20 | def get_lvis_instances_meta_v1(): 21 | assert len(LVIS_V1_CATEGORIES) == 1203 22 | cat_ids = [k["id"] for k in LVIS_V1_CATEGORIES] 23 | assert min(cat_ids) == 1 and max(cat_ids) == len( 24 | cat_ids 25 | ), "Category ids are not in [1, #categories], as expected" 26 | # Ensure that the category list is sorted by id 27 | thing_ids = [k["id"] for k in LVIS_V1_CATEGORIES] 28 | # lvis_categories = sorted(LVIS_V1_CATEGORIES, key=lambda x: x["id"]) 29 | thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} 30 | # thing_classes = [k["name"] for k in O365_CATEGORIES] 31 | def preprocess_name(name): 32 | name = 
name.lower().strip() 33 | name = name.replace('_', ' ') 34 | return name 35 | thing_classes = [preprocess_name(k["synonyms"][0]) for k in LVIS_V1_CATEGORIES] 36 | meta = { 37 | "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, 38 | "thing_classes": thing_classes, 39 | } 40 | return meta 41 | 42 | 43 | def register_lvis_instances(name, metadata, json_file, image_root): 44 | """ 45 | Register a dataset in LVIS's json annotation format for instance detection and segmentation. 46 | 47 | Args: 48 | name (str): a name that identifies the dataset, e.g. "lvis_v0.5_train". 49 | metadata (dict): extra metadata associated with this dataset. It can be an empty dict. 50 | json_file (str): path to the json instance annotation file. 51 | image_root (str or path-like): directory which contains all the images. 52 | """ 53 | DatasetCatalog.register(name, lambda: load_lvis_json(image_root, json_file, name)) 54 | MetadataCatalog.get(name).set( 55 | json_file=json_file, image_root=image_root, evaluator_type="lvis", **metadata 56 | ) 57 | 58 | 59 | def load_lvis_json(image_root, annot_json, metadata): 60 | """ 61 | Args: 62 | image_dir (str): path to the raw dataset. e.g., "~/coco/train2017". 63 | gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017". 64 | json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json". 65 | Returns: 66 | list[dict]: a list of dicts in Detectron2 standard format. (See 67 | `Using Custom Datasets `_ ) 68 | """ 69 | with PathManager.open(annot_json) as f: 70 | json_info = json.load(f) 71 | 72 | imageid2seg = {} 73 | imageid2box = {} 74 | imageid2lable = {} 75 | for anno in json_info["annotations"]: 76 | image_id = anno['image_id'] 77 | seg = anno["segmentation"] 78 | bbox = anno["bbox"] 79 | label = anno["category_id"] 80 | if image_id not in imageid2seg: 81 | imageid2seg[image_id] = [] 82 | if image_id not in imageid2box: 83 | imageid2box[image_id] = [] 84 | if image_id not in imageid2lable: 85 | imageid2lable[image_id] = [] 86 | imageid2seg[image_id] += [seg] 87 | imageid2box[image_id] += [bbox] 88 | imageid2lable[image_id] += [label] 89 | 90 | ret = [] 91 | cnt_empty = 0 92 | for image in json_info["images"]: 93 | image_file = os.path.join(image_root ,'/'.join(image["coco_url"].split('/')[-2:])) 94 | image_id = image['id'] 95 | if image_id not in imageid2lable: 96 | cnt_empty += 1 97 | continue 98 | ret.append( 99 | { 100 | "file_name": image_file, 101 | "image_id": image_id, 102 | "height": image['height'], 103 | "width": image['width'], 104 | "instance": imageid2seg[image_id], 105 | "box": imageid2box[image_id], 106 | "labels": imageid2lable[image_id], 107 | } 108 | ) 109 | 110 | print("Empty annotations: {}".format(cnt_empty)) 111 | assert len(ret), f"No images found in {image_root}!" 
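# Note on the records collected above: each dict uses a flat, non-standard layout
# rather than Detectron2's usual "annotations" field. "instance" holds the raw LVIS
# segmentations, "box" the COCO-style [x, y, w, h] boxes, and "labels" the LVIS
# category ids, kept as parallel per-image lists for downstream dataset mappers to unpack.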
112 | assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"] 113 | return ret 114 | 115 | 116 | def register_all_lvis(_root_eval, _root_train): 117 | for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_LVIS.items(): 118 | for key, (image_root, json_file) in splits_per_dataset.items(): 119 | if 'val' in key: 120 | root = _root_eval 121 | else: 122 | root = _root_train 123 | register_lvis_instances( 124 | key, 125 | get_lvis_instances_meta_v1(), 126 | os.path.join(root, json_file) if "://" not in json_file else json_file, 127 | os.path.join(root, image_root), 128 | ) 129 | 130 | 131 | _root_eval = os.getenv("DATASET3", "datasets") 132 | _root_train = os.getenv("DATASET", "datasets") 133 | if _root_train!='datasets' and _root_eval!='datasets': 134 | register_all_lvis(_root_eval, _root_train) -------------------------------------------------------------------------------- /datasets/registration/register_paco_part_all.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import logging 3 | import os 4 | from detectron2.data import DatasetCatalog, MetadataCatalog 5 | import copy 6 | # from detectron2.data.datasets.register_coco import register_coco_instances 7 | from detectron2.data.datasets.coco import load_coco_json 8 | import json 9 | 10 | 11 | def _get_paco_metadata(key): 12 | # if '_base' in key: 13 | # id_to_name = {x['id']: x['name'] for x in PASCAL_PART_BASE_CATEGORIES} 14 | # else: 15 | id_to_name = {x['id']: x['name'] for x in PACO_CATEGORIES} 16 | 17 | thing_classes_ = [id_to_name[k] for k in sorted(id_to_name)] 18 | PACO_CATEGORIES_=copy.deepcopy(PACO_CATEGORIES) 19 | for cat in PACO_CATEGORIES_: 20 | if ':' not in cat['name']: 21 | cat['name']=cat['name']+':whole' 22 | if '_(' in cat['name']: 23 | cat['name']=cat['name'][:cat['name'].find('_(')]+cat['name'][cat['name'].find(')')+1:] 24 | if '_' in cat['name']: 25 | cat['name']=cat['name'].replace('_',' ') 26 | id_to_name = {x['id']: x['name'] for x in PACO_CATEGORIES_} 27 | thing_dataset_id_to_contiguous_id = { 28 | x: i for i, x in enumerate(sorted(id_to_name))} 29 | thing_classes = [id_to_name[k] for k in sorted(id_to_name)] 30 | 31 | part_classes = [a.split(":")[1].lower() for a in thing_classes] 32 | thing_clases_id_to_part_id={v: sorted(set(part_classes)).index(n) for v, n in enumerate(part_classes)} 33 | whole_classes = [a.split(":")[0].lower() for a in thing_classes] 34 | 35 | no_part_index = sorted(set(part_classes)).index('whole') 36 | thing_classes_id_without_part = [k for k, v in thing_clases_id_to_part_id.items() if no_part_index==v] 37 | 38 | thing_clases_id_to_whole_id={v: sorted(set(whole_classes)).index(n) for v, n in enumerate(whole_classes)} 39 | thing_clases_id_to_flattened_wholepart = {tid: thing_clases_id_to_whole_id[tid]*len(set(part_classes))+pid for tid, pid in thing_clases_id_to_part_id.items()} 40 | return { 41 | "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, 42 | "thing_classes": thing_classes_, 43 | "thing_clases_id_to_part_id": thing_clases_id_to_part_id, 44 | "part_classes": sorted(set(part_classes)), 45 | "thing_clases_id_to_whole_id": thing_clases_id_to_whole_id, 46 | "whole_classes": sorted(set(whole_classes)), 47 | "thing_clases_id_to_flattened_wholepart": thing_clases_id_to_flattened_wholepart, 48 | "thing_classes_id_without_part": thing_classes_id_without_part, 49 | } 50 | 51 | 52 | def register_paco_part_instances(name, metadata, json_file, image_root): 53 | 
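# Register one PACO part-segmentation split: annotations stay in COCO json format and
# are loaded lazily via load_coco_json, while the metadata built above attaches the
# whole/part id mappings used by the "pascal_part_interactive" evaluator.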
DatasetCatalog.register(name, lambda: load_coco_json( 54 | json_file, image_root, name)) 55 | MetadataCatalog.get(name).set( 56 | json_file=json_file, image_root=image_root, 57 | evaluator_type="pascal_part_interactive", **metadata 58 | ) 59 | 60 | _PACO = { 61 | "paco_train": ("coco", "paco/annotations/paco_lvis_v1_train.json"), 62 | # "pascal_part_train_one": ("pascal_part/VOCdevkit/VOC2010/JPEGImages", "pascal_part/train_one.json"), 63 | "paco_val_inter": ("coco", "paco/annotations/paco_lvis_v1_val_mini.json"), 64 | # "paco_test": ("paco/val2017", "paco/annotations/paco_lvis_v1_val.json"), 65 | # "pascal_part_base_train": ("pascal_part/VOCdevkit/VOC2010/JPEGImages", "pascal_part/train_base.json"), 66 | # "pascal_part_base_train_one": ("pascal_part/VOCdevkit/VOC2010/JPEGImages", "pascal_part/train_base_one.json"), 67 | # "imagenet_voc_parsed": ("imagenet/train", "imagenet/imagenet_voc_image_parsed.json"), 68 | # "imagenet_golden_pascal_parsed": ("imagenet/train", "imagenet/imagenet_golden_pascal_parsed.json"), 69 | # "imagenet_golden_pascal_parsed_swinbase": ("imagenet/train", "imagenet/imagenet_golden_pascal_parsed_swinbase.json"), 70 | } 71 | 72 | 73 | def register_paco_part(root): 74 | for key, (image_root, json_file) in _PACO.items(): 75 | register_paco_part_instances( 76 | key, 77 | _get_paco_metadata(key), 78 | os.path.join(root, json_file) if "://" not in json_file else json_file, 79 | os.path.join(root, image_root), 80 | ) 81 | 82 | _root = os.getenv("PACO", "datasets") 83 | if _root!="datasets": 84 | with open(os.path.join(_root,"paco/annotations/paco_lvis_v1_val.json")) as f: 85 | j=json.load(f) 86 | PACO_CATEGORIES=j['categories'] 87 | 88 | register_paco_part(_root) 89 | else: 90 | print("skip paco register") -------------------------------------------------------------------------------- /datasets/registration/register_partimagenet_part_all.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
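# Registration for PartImageNet part segmentation. Category names below follow the
# "<Whole> <Part>" pattern, and the metadata helper splits them into whole/part id
# maps plus a flattened index (whole_id * num_part_classes + part_id), mirroring the
# PACO registry above.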
2 | import logging 3 | import os 4 | from detectron2.data import DatasetCatalog, MetadataCatalog 5 | from detectron2.data.datasets.coco import load_coco_json 6 | 7 | PART_IN_CATEGORIES = [{'id': 0, 'name': 'Quadruped Head', 'supercategory': 'Quadruped'}, 8 | {'id': 1, 'name': 'Quadruped Body', 'supercategory': 'Quadruped'}, 9 | {'id': 2, 'name': 'Quadruped Foot', 'supercategory': 'Quadruped'}, 10 | {'id': 3, 'name': 'Quadruped Tail', 'supercategory': 'Quadruped'}, 11 | {'id': 4, 'name': 'Biped Head', 'supercategory': 'Biped'}, 12 | {'id': 5, 'name': 'Biped Body', 'supercategory': 'Biped'}, 13 | {'id': 6, 'name': 'Biped Hand', 'supercategory': 'Biped'}, 14 | {'id': 7, 'name': 'Biped Foot', 'supercategory': 'Biped'}, 15 | {'id': 8, 'name': 'Biped Tail', 'supercategory': 'Biped'}, 16 | {'id': 9, 'name': 'Fish Head', 'supercategory': 'Fish'}, 17 | {'id': 10, 'name': 'Fish Body', 'supercategory': 'Fish'}, 18 | {'id': 11, 'name': 'Fish Fin', 'supercategory': 'Fish'}, 19 | {'id': 12, 'name': 'Fish Tail', 'supercategory': 'Fish'}, 20 | {'id': 13, 'name': 'Bird Head', 'supercategory': 'Bird'}, 21 | {'id': 14, 'name': 'Bird Body', 'supercategory': 'Bird'}, 22 | {'id': 15, 'name': 'Bird Wing', 'supercategory': 'Bird'}, 23 | {'id': 16, 'name': 'Bird Foot', 'supercategory': 'Bird'}, 24 | {'id': 17, 'name': 'Bird Tail', 'supercategory': 'Bird'}, 25 | {'id': 18, 'name': 'Snake Head', 'supercategory': 'Snake'}, 26 | {'id': 19, 'name': 'Snake Body', 'supercategory': 'Snake'}, 27 | {'id': 20, 'name': 'Reptile Head', 'supercategory': 'Reptile'}, 28 | {'id': 21, 'name': 'Reptile Body', 'supercategory': 'Reptile'}, 29 | {'id': 22, 'name': 'Reptile Foot', 'supercategory': 'Reptile'}, 30 | {'id': 23, 'name': 'Reptile Tail', 'supercategory': 'Reptile'}, 31 | {'id': 24, 'name': 'Car Body', 'supercategory': 'Car'}, 32 | {'id': 25, 'name': 'Car Tier', 'supercategory': 'Car'}, 33 | {'id': 26, 'name': 'Car Side Mirror', 'supercategory': 'Car'}, 34 | {'id': 27, 'name': 'Bicycle Body', 'supercategory': 'Bicycle'}, 35 | {'id': 28, 'name': 'Bicycle Head', 'supercategory': 'Bicycle'}, 36 | {'id': 29, 'name': 'Bicycle Seat', 'supercategory': 'Bicycle'}, 37 | {'id': 30, 'name': 'Bicycle Tier', 'supercategory': 'Bicycle'}, 38 | {'id': 31, 'name': 'Boat Body', 'supercategory': 'Boat'}, 39 | {'id': 32, 'name': 'Boat Sail', 'supercategory': 'Boat'}, 40 | {'id': 33, 'name': 'Aeroplane Head', 'supercategory': 'Aeroplane'}, 41 | {'id': 34, 'name': 'Aeroplane Body', 'supercategory': 'Aeroplane'}, 42 | {'id': 35, 'name': 'Aeroplane Engine', 'supercategory': 'Aeroplane'}, 43 | {'id': 36, 'name': 'Aeroplane Wing', 'supercategory': 'Aeroplane'}, 44 | {'id': 37, 'name': 'Aeroplane Tail', 'supercategory': 'Aeroplane'}, 45 | {'id': 38, 'name': 'Bottle Mouth', 'supercategory': 'Bottle'}, 46 | {'id': 39, 'name': 'Bottle Body', 'supercategory': 'Bottle'}] 47 | 48 | 49 | def _get_partimagenet_metadata(key): 50 | id_to_name = {x['id']: x['name'] for x in PART_IN_CATEGORIES} 51 | thing_dataset_id_to_contiguous_id = { 52 | x: i for i, x in enumerate(sorted(id_to_name))} 53 | thing_classes = [id_to_name[k] for k in sorted(id_to_name)] 54 | 55 | part_classes = [a.split(" ")[1].lower() for a in thing_classes] 56 | thing_clases_id_to_part_id = {v: sorted(set(part_classes)).index(n) for v, n in enumerate(part_classes)} 57 | whole_classes = [a.split(" ")[0].lower() for a in thing_classes] 58 | thing_clases_id_to_whole_id = {v: sorted(set(whole_classes)).index(n) for v, n in enumerate(whole_classes)} 59 | thing_clases_id_to_flattened_wholepart = {tid: 
thing_clases_id_to_whole_id[tid] * len(set(part_classes)) + pid for 60 | tid, pid in thing_clases_id_to_part_id.items()} 61 | return { 62 | "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, 63 | "thing_classes": thing_classes, 64 | "thing_clases_id_to_part_id": thing_clases_id_to_part_id, 65 | "part_classes": sorted(set(part_classes)), 66 | "thing_clases_id_to_whole_id": thing_clases_id_to_whole_id, 67 | "whole_classes": sorted(set(whole_classes)), 68 | "thing_clases_id_to_flattened_wholepart": thing_clases_id_to_flattened_wholepart, 69 | } 70 | 71 | 72 | def register_partimagenet_part_instances(name, metadata, json_file, image_root): 73 | DatasetCatalog.register(name, lambda: load_coco_json( 74 | json_file, image_root, name)) 75 | MetadataCatalog.get(name).set( 76 | json_file=json_file, image_root=image_root, 77 | evaluator_type="pascal_part_interactive", **metadata 78 | ) 79 | 80 | 81 | _PART_IN = { 82 | "partimagenet_train": ("imagenet/train", "partimagenet/train_format.json"), 83 | "partimagenet_val_inter": ("imagenet/val", "partimagenet/val_format_mini.json"), 84 | } 85 | 86 | 87 | def register_partimagenet_part(root): 88 | for key, (image_root, json_file) in _PART_IN.items(): 89 | register_partimagenet_part_instances( 90 | key, 91 | _get_partimagenet_metadata(key), 92 | os.path.join(root, json_file) if "://" not in json_file else json_file, 93 | os.path.join(root, image_root), 94 | ) 95 | 96 | 97 | _root = os.getenv("PART_IN", "datasets") 98 | if _root!='datasets': 99 | register_partimagenet_part(_root) 100 | -------------------------------------------------------------------------------- /datasets/registration/register_sam.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The IDEA Authors. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ------------------------------------------------------------------------------------------------ 16 | # Copyright (c) Facebook, Inc. and its affiliates. 
17 | # ------------------------------------------------------------------------------------------------ 18 | # Modified from: 19 | # https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/datasets/register_ade20k_instance.py 20 | # ------------------------------------------------------------------------------------------------ 21 | 22 | import json 23 | import logging 24 | import numpy as np 25 | import os 26 | from PIL import Image 27 | 28 | from detectron2.data import DatasetCatalog, MetadataCatalog 29 | from detectron2.utils.file_io import PathManager 30 | import detectron2.utils.comm as comm 31 | import torch.distributed as dist 32 | 33 | import os.path as op 34 | 35 | SAM_CATEGORIES = [{'id': 1, 'name': 'stuff'}] 36 | 37 | _PREDEFINED_SPLITS = { 38 | # point annotations without masks 39 | "sam_train": ( 40 | "", 41 | ), 42 | "sam_val": ( 43 | "", 44 | ), 45 | } 46 | 47 | 48 | def _get_sam_instances_meta(): 49 | thing_ids = [k["id"] for k in SAM_CATEGORIES] 50 | assert len(thing_ids) == 1, len(thing_ids) 51 | # Mapping from the incontiguous ADE category id to an id in [0, 99] 52 | thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} 53 | thing_classes = [k["name"] for k in SAM_CATEGORIES] 54 | ret = { 55 | "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, 56 | "thing_classes": thing_classes, 57 | } 58 | return ret 59 | 60 | def load_sam_index(tsv_file, dataset_name=None, extra_annotation_keys=None): 61 | """ 62 | Load a json file with COCO's instances annotation format. 63 | Currently supports instance detection, instance segmentation, 64 | and person keypoints annotations. 65 | """ 66 | dataset_dicts = [] 67 | tsv_id = 0 68 | files = os.listdir(tsv_file) 69 | start = int(os.getenv("SAM_SUBSET_START", "90")) 70 | end = int(os.getenv("SAM_SUBSET_END", "100")) 71 | if len(files)>0 and 'part' in files[0]: # for hgx 72 | files = [f for f in files if '.tsv' in f and int(f.split('.')[1].split('_')[-1])>=start and int(f.split('.')[1].split('_')[-1])=start and int(f.split('.')[0].split('-')[-1])`_ ) 48 | """ 49 | 50 | with PathManager.open(annot_json) as f: 51 | json_info = json.load(f) 52 | 53 | # build dictionary for grounding 54 | grd_dict = collections.defaultdict(list) 55 | for grd_ann in json_info['annotations']: 56 | image_id = int(grd_ann["image_id"]) 57 | grd_dict[image_id].append(grd_ann) 58 | 59 | ret = [] 60 | for image in json_info["images"]: 61 | image_id = int(image["id"]) 62 | image_file = os.path.join(image_root, image['file_name']) 63 | grounding_anno = grd_dict[image_id] 64 | 65 | if 'train' in name and len(grounding_anno) == 0: 66 | continue 67 | 68 | ret.append( 69 | { 70 | "file_name": image_file, 71 | "image_id": image_id, 72 | "inst_info": grounding_anno, 73 | } 74 | ) 75 | 76 | assert len(ret), f"No images found in {image_root}!" 
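# Each SEGINW record keeps the raw per-image grounding annotations under "inst_info";
# for training splits, images without any annotation were skipped above.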
77 | assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"] 78 | return ret 79 | 80 | 81 | def register_seginw( 82 | name, metadata, image_root, annot_json): 83 | DatasetCatalog.register( 84 | name, 85 | lambda: load_seginw_json(name, image_root, annot_json, metadata), 86 | ) 87 | MetadataCatalog.get(name).set( 88 | image_root=image_root, 89 | json_file=annot_json, 90 | evaluator_type="seginw", 91 | ignore_label=255, 92 | label_divisor=1000, 93 | **metadata, 94 | ) 95 | 96 | 97 | def register_all_seginw(root): 98 | for ( 99 | prefix, 100 | (split, folder_name, annot_name), 101 | ) in _PREDEFINED_SPLITS_SEGINW.items(): 102 | register_seginw( 103 | prefix, 104 | get_metadata(), 105 | os.path.join(root, folder_name, split), 106 | os.path.join(root, folder_name, split, annot_name), 107 | ) 108 | 109 | 110 | _root = os.getenv("DATSETW", "datasets") 111 | if _root!='datasets': 112 | register_all_seginw(_root) 113 | -------------------------------------------------------------------------------- /datasets/registration/register_sunrgbd_semseg.py: -------------------------------------------------------------------------------- 1 | 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # -------------------------------------------------------- 4 | # X-Decoder -- Generalized Decoding for Pixel, Image, and Language 5 | # Copyright (c) 2022 Microsoft 6 | # Licensed under The MIT License [see LICENSE for details] 7 | # Modified by Xueyan Zou (xueyan@cs.wisc.edu) 8 | # -------------------------------------------------------- 9 | import numpy as np 10 | import os 11 | import glob 12 | from typing import List, Tuple, Union 13 | 14 | from detectron2.data import DatasetCatalog, MetadataCatalog 15 | from detectron2.structures import BoxMode 16 | from detectron2.utils.file_io import PathManager 17 | 18 | from utils.constants import SUN_RGBD_37 19 | 20 | __all__ = ["load_sunrgbd_instances", "register_sunrgbd_context"] 21 | 22 | def load_sunrgbd_instances(name: str, dirname: str, split: str, class_names: Union[List[str], Tuple[str, ...]]): 23 | """ 24 | Load SUN-RGBD detection annotations to Detectron2 format. 25 | 26 | Args: 27 | dirname: Contain "Annotations", "ImageSets", "JPEGImages" 28 | split (str): one of "train", "test", "val", "trainval" 29 | class_names: list or tuple of class names 30 | """ 31 | if split == 'val': 32 | split = 'test' 33 | 34 | # Needs to read many small annotation files. 
Makes sense at local 35 | image_pths = sorted(glob.glob(os.path.join(dirname, 'image', split, '*.jpg'))) 36 | semseg_pths = sorted(glob.glob(os.path.join(dirname, 'label37', split, '*.png'))) 37 | 38 | assert len(image_pths) == len(semseg_pths) 39 | # 5k images 40 | dicts = [] 41 | for image_dir, semseg_dir in zip(image_pths, semseg_pths): 42 | r = { 43 | "file_name": image_dir, 44 | "sem_seg_file_name": semseg_dir, 45 | "image_id": semseg_dir.split('/')[-1].split('.')[0], 46 | } 47 | dicts.append(r) 48 | return dicts 49 | 50 | 51 | def register_sun_context(name, dirname, split, class_names=SUN_RGBD_37): 52 | DatasetCatalog.register(name, lambda: load_sunrgbd_instances(name, dirname, split, class_names)) 53 | MetadataCatalog.get(name).set( 54 | stuff_classes=class_names, 55 | dirname=dirname, 56 | split=split, 57 | ignore_label=[0], 58 | thing_dataset_id_to_contiguous_id={}, 59 | class_offset=1, 60 | keep_sem_bgd=False 61 | ) 62 | 63 | 64 | def register_all_sunrgbd_seg(root): 65 | SPLITS = [ 66 | ("sunrgbd_37_val_seg", "sun_rgbd", "val"), 67 | ] 68 | 69 | for name, dirname, split in SPLITS: 70 | register_sun_context(name, os.path.join(root, dirname), split) 71 | MetadataCatalog.get(name).evaluator_type = "sem_seg" 72 | 73 | 74 | _root = os.getenv("DATSETW", "datasets") 75 | if _root!='datasets': 76 | register_all_sunrgbd_seg(_root) -------------------------------------------------------------------------------- /datasets/registration/register_ytvos_dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | import os 4 | import glob 5 | import json 6 | from typing import List, Tuple, Union 7 | 8 | import cv2 9 | import numpy as np 10 | from scipy.io import loadmat 11 | 12 | from detectron2.data import DatasetCatalog, MetadataCatalog 13 | from detectron2.structures import BoxMode 14 | from detectron2.utils.file_io import PathManager 15 | 16 | 17 | __all__ = ["load_ytovs_instances", "register_ytvos_context"] 18 | 19 | def load_ytvos_instances(name: str, dirname: str, split: str): 20 | """ 21 | Load Pascal VOC detection annotations to Detectron2 format. 
22 | 23 | Args: 24 | dirname: Contain "Annotations", "ImageSets", "JPEGImages" 25 | split (str): one of "train", "test", "val", "trainval" 26 | class_names: list or tuple of class names 27 | """ 28 | meta_json = os.path.join(dirname, split, "meta.json") 29 | video_dir = os.path.join(dirname, split, 'JPEGImages') 30 | mask_dir = os.path.join(dirname, split, 'Annotations') 31 | video_names = os.listdir(video_dir) 32 | meta = json.load(open(meta_json))['videos'] 33 | 34 | dicts = [] 35 | for vid_name in video_names: 36 | objects = meta[vid_name]['objects'] 37 | r = { 38 | "file_name": os.path.join(video_dir, vid_name), 39 | "mask_name": os.path.join(mask_dir, vid_name), 40 | "objects": objects, 41 | } 42 | dicts.append(r) 43 | 44 | return dicts 45 | 46 | def register_ytvos_context(name, dirname, split): 47 | if not os.path.exists(dirname): 48 | print("not register for ", name) 49 | return -1 50 | DatasetCatalog.register("{}".format(name), lambda: load_ytvos_instances(name, dirname, split)) 51 | MetadataCatalog.get("{}".format(name)).set( 52 | dirname=dirname, 53 | thing_dataset_id_to_contiguous_id={}, 54 | ) 55 | 56 | def register_all_davis(root): 57 | SPLITS = [ 58 | ("ytvos19_val", "ytvos2019", "valid"), 59 | ("ytvos18_val", "ytvos2018", "valid"), 60 | ] 61 | 62 | for name, dirname, split in SPLITS: 63 | register_ytvos_context(name, os.path.join(root, dirname), split) 64 | MetadataCatalog.get("{}".format(name)).evaluator_type = None 65 | 66 | _root = os.getenv("TRACKING_DATASET", "datasets") 67 | if _root!='datasets': 68 | register_all_davis(_root) -------------------------------------------------------------------------------- /datasets/semseg_loader.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import scipy.io 3 | import numpy as np 4 | 5 | def load_semseg(filename, loader_type): 6 | if loader_type == 'PIL': 7 | semseg = np.array(Image.open(filename), dtype=np.int) 8 | elif loader_type == 'MAT': 9 | semseg = scipy.io.loadmat(filename)['LabelMap'] 10 | return semseg -------------------------------------------------------------------------------- /datasets/shapes/__init__.py: -------------------------------------------------------------------------------- 1 | from .sampler import ShapeSampler 2 | from .simpleclick_sampler import SimpleClickSampler 3 | 4 | 5 | def build_shape_sampler(cfg, **kwargs): 6 | sampler_name = cfg['STROKE_SAMPLER']['EVAL']['MODE'] 7 | if sampler_name == 'random': 8 | return ShapeSampler(cfg, **kwargs) 9 | elif sampler_name == 'best': 10 | return SimpleClickSampler(cfg, **kwargs) 11 | else: 12 | assert False, "not implemented" -------------------------------------------------------------------------------- /datasets/shapes/sampler.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import random 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from dinov.utils import configurable 8 | from .scribble import Scribble 9 | 10 | 11 | class ShapeSampler(nn.Module): 12 | @configurable 13 | def __init__(self, max_candidate=1, shape_prob=[], shape_candidate=[], is_train=True): 14 | super().__init__() 15 | self.max_candidate = max_candidate 16 | self.shape_prob = shape_prob 17 | self.shape_candidate = shape_candidate 18 | self.is_train = is_train 19 | 20 | @classmethod 21 | def from_config(cls, cfg, is_train=True, mode=None): 22 | max_candidate = cfg['STROKE_SAMPLER']['MAX_CANDIDATE'] 23 | candidate_probs = cfg['STROKE_SAMPLER']['CANDIDATE_PROBS'] 24 | 
candidate_names = cfg['STROKE_SAMPLER']['CANDIDATE_NAMES'] 25 | candidate_classes = [getattr(sys.modules[__name__], class_name)(cfg, is_train) for class_name in candidate_names] 26 | 27 | # overwrite condidate_prob 28 | if not is_train: 29 | candidate_probs = [0.0 for x in range(len(candidate_names))] 30 | candidate_probs[candidate_names.index(mode)] = 1.0 31 | 32 | # Build augmentation 33 | return { 34 | "max_candidate": max_candidate, 35 | "shape_prob": candidate_probs, 36 | "shape_candidate": candidate_classes, 37 | "is_train": is_train, 38 | } 39 | 40 | def forward(self, masks, boxes, max_candidate=50): 41 | # masks = instances.gt_masks.tensor 42 | # boxes = instances.gt_boxes.tensor 43 | 44 | if len(masks) == 0: 45 | gt_masks = torch.zeros(masks.shape[-2:]).bool() 46 | rand_masks = torch.zeros(masks.shape[-2:]).bool() 47 | return {'gt_masks': gt_masks[None,:], 'rand_shape': torch.stack([rand_masks]), 'types': ['none']} 48 | indices = [x for x in range(len(masks))] 49 | 50 | if self.is_train: 51 | # random.shuffle(indices) 52 | candidate_mask = masks[indices[:max_candidate]] 53 | # candidate_box = boxes[indices[:max_candidate]] 54 | else: 55 | candidate_mask = masks 56 | candidate_box = boxes 57 | 58 | draw_funcs = random.choices(self.shape_candidate, weights=self.shape_prob, k=len(candidate_mask)) # sample one shape, i.e., point 59 | rand_shapes = [d.draw(x, b).cuda() for d,x, b in zip(draw_funcs, candidate_mask, candidate_box)] 60 | types = [repr(x) for x in draw_funcs] 61 | for i in range(0, len(rand_shapes)): 62 | if rand_shapes[i].sum() == 0: 63 | candidate_mask[i] = candidate_mask[i] * 0 64 | types[i] = 'none' 65 | 66 | # candidate_mask: (c,h,w), bool. rand_shape: (c, iter, h, w), bool. types: list(c) 67 | try: 68 | rand_shapess=torch.stack(rand_shapes) 69 | except RuntimeError: 70 | for r in rand_shapes: 71 | print('r ', r.device()) 72 | print(candidate_mask.device()) 73 | return {'gt_masks': candidate_mask, 'rand_shape': torch.stack(rand_shapes), 'types': types, 'sampler': self} 74 | 75 | def build_shape_sampler(cfg, **kwargs): 76 | return ShapeSampler(cfg, **kwargs) -------------------------------------------------------------------------------- /datasets/shapes/scribble.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import torch 4 | 5 | from .mask_generators import get_mask_by_input_strokes 6 | 7 | 8 | class Scribble: 9 | def __init__(self, cfg, is_train): 10 | self.num_stroke = cfg['STROKE_SAMPLER']['SCRIBBLE']['NUM_STROKES'] 11 | self.stroke_preset = cfg['STROKE_SAMPLER']['SCRIBBLE']['STROKE_PRESET'] 12 | self.stroke_prob = cfg['STROKE_SAMPLER']['SCRIBBLE']['STROKE_PROB'] 13 | self.eval_stroke = cfg['STROKE_SAMPLER']['EVAL']['MAX_ITER'] 14 | self.is_train = is_train 15 | 16 | @staticmethod 17 | def get_stroke_preset(stroke_preset): 18 | if stroke_preset == 'rand_curve': 19 | return { 20 | "nVertexBound": [20, 50], 21 | "maxHeadSpeed": 30, 22 | "maxHeadAcceleration": (30, 0.5), 23 | "brushWidthBound": (3, 15), 24 | "nMovePointRatio": 0.5, 25 | "maxPiontMove": 6, 26 | "maxLineAcceleration": (9, 0.5), 27 | "boarderGap": None, 28 | "maxInitSpeed": 10 29 | } 30 | elif stroke_preset == 'rand_curve_small': 31 | return { 32 | "nVertexBound": [6, 22], 33 | "maxHeadSpeed": 12, 34 | "maxHeadAcceleration": (8, 0.5), 35 | "brushWidthBound": (2.5, 5), 36 | "nMovePointRatio": 0.5, 37 | "maxPiontMove": 1.5, 38 | "maxLineAcceleration": (3, 0.5), 39 | "boarderGap": None, 40 | "maxInitSpeed": 3 41 | } 42 | else: 43 | raise 
NotImplementedError(f'The stroke presetting "{stroke_preset}" does not exist.') 44 | 45 | def get_random_points_from_mask(self, mask, n=5): 46 | h,w = mask.shape 47 | view_mask = mask.reshape(h*w) 48 | non_zero_idx = view_mask.nonzero()[:,0] 49 | selected_idx = torch.randperm(len(non_zero_idx))[:n] 50 | non_zero_idx = non_zero_idx[selected_idx] 51 | y = (non_zero_idx // w)*1.0 52 | x = (non_zero_idx % w)*1.0 53 | return torch.cat((x[:,None], y[:,None]), dim=1).cpu().numpy() 54 | 55 | def draw(self, mask=None, box=None): 56 | if mask.sum() < 1: 57 | return torch.zeros(mask.shape).bool().cuda() # if mask is empty 58 | if not self.is_train: 59 | return self.draw_eval(mask=mask, box=box) 60 | stroke_preset_name = random.choices(self.stroke_preset, weights=self.stroke_prob, k=1)[0] 61 | preset = Scribble.get_stroke_preset(stroke_preset_name) 62 | nStroke = random.randint(1, min(self.num_stroke, mask.sum().item())) 63 | h,w = mask.shape 64 | points = self.get_random_points_from_mask(mask, n=nStroke) 65 | rand_mask = get_mask_by_input_strokes( 66 | init_points=points, 67 | imageWidth=w, imageHeight=h, nStroke=min(nStroke, len(points)), **preset) 68 | rand_mask = (~torch.from_numpy(rand_mask)).cuda() * mask 69 | return rand_mask 70 | 71 | def draw_eval(self, mask=None, box=None): 72 | stroke_preset_name = random.choices(self.stroke_preset, weights=self.stroke_prob, k=1)[0] 73 | preset = Scribble.get_stroke_preset(stroke_preset_name) 74 | nStroke = min(self.eval_stroke, mask.sum().item()) 75 | h,w = mask.shape 76 | points = self.get_random_points_from_mask(mask, n=nStroke) 77 | rand_masks = [] 78 | for i in range(len(points)): 79 | rand_mask = get_mask_by_input_strokes( 80 | init_points=points[:i+1], 81 | imageWidth=w, imageHeight=h, nStroke=min(i, len(points)), **preset) 82 | rand_mask = (~torch.from_numpy(rand_mask)).cuda() * mask 83 | rand_masks += [rand_mask] 84 | return torch.stack(rand_masks) 85 | 86 | @staticmethod 87 | def draw_by_points(points, mask, h, w): 88 | preset = Scribble.get_stroke_preset('rand_curve_small') 89 | rand_mask = get_mask_by_input_strokes( 90 | init_points=points, 91 | imageWidth=w, imageHeight=h, nStroke=len(points), **preset)[None,] 92 | rand_masks = (~torch.from_numpy(rand_mask)) * mask 93 | return rand_masks 94 | 95 | def __repr__(self,): 96 | return 'scribble' -------------------------------------------------------------------------------- /datasets/shapes/simpleclick_sampler.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import random 3 | 4 | import cv2 5 | import numpy as np 6 | from scipy import ndimage 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from kornia.contrib import distance_transform 11 | 12 | from .scribble import Scribble 13 | from dinov.utils import configurable 14 | 15 | 16 | class SimpleClickSampler(nn.Module): 17 | @configurable 18 | def __init__(self, mask_mode='point', sample_negtive=False, is_train=True, dilation=None, dilation_kernel=None): 19 | super().__init__() 20 | self.mask_mode = mask_mode 21 | self.sample_negtive = sample_negtive 22 | self.is_train = is_train 23 | self.dilation = dilation 24 | self.register_buffer("dilation_kernel", dilation_kernel) 25 | 26 | @classmethod 27 | def from_config(cls, cfg, is_train=True, mode=None): 28 | mask_mode = mode 29 | sample_negtive = cfg['STROKE_SAMPLER']['EVAL']['NEGATIVE'] 30 | 31 | dilation = cfg['STROKE_SAMPLER']['DILATION'] 32 | dilation_kernel = torch.ones((1, 1, dilation, dilation), 
device=torch.cuda.current_device()) 33 | 34 | # Build augmentation 35 | return { 36 | "mask_mode": mask_mode, 37 | "sample_negtive": sample_negtive, 38 | "is_train": is_train, 39 | "dilation": dilation, 40 | "dilation_kernel": dilation_kernel, 41 | } 42 | 43 | def forward_scribble(self, instances, pred_masks=None, prev_masks=None): 44 | gt_masks_batch = instances.gt_masks 45 | _,h,w = gt_masks_batch.shape 46 | 47 | rand_shapes = [] 48 | for i in range(len(gt_masks_batch)): 49 | gt_masks = gt_masks_batch[i:i+1] 50 | assert len(gt_masks) == 1 # it only supports a single image, with a single candidate mask. 51 | # pred_masks is after padding 52 | 53 | # We only consider positive points 54 | pred_masks = torch.zeros(gt_masks.shape).bool() if pred_masks is None else pred_masks[:,:h,:w] 55 | prev_masks = torch.zeros(gt_masks.shape).bool() if prev_masks is None else prev_masks 56 | 57 | fp = gt_masks & (~(gt_masks & pred_masks)) & (~prev_masks) 58 | next_mask = torch.zeros(gt_masks.shape).bool() 59 | 60 | mask_dt = torch.from_numpy(cv2.distanceTransform(fp[0].numpy().astype(np.uint8), cv2.DIST_L2, 0)[None,:]) 61 | max_value = mask_dt.max() 62 | next_mask[(mask_dt==max_value).nonzero()[0:1].t().tolist()] = True 63 | 64 | points = next_mask[0].nonzero().flip(dims=[-1]) 65 | next_mask = Scribble.draw_by_points(points, gt_masks, h, w) 66 | rand_shapes += [(prev_masks | next_mask)] 67 | 68 | types = ['scribble' for i in range(len(gt_masks_batch))] 69 | return {'gt_masks': instances.gt_masks, 'rand_shape': rand_shapes, 'types': types, 'sampler': self} 70 | 71 | def forward(self, instances, *args, **kwargs): 72 | if self.mask_mode == 'Point': 73 | return self.forward_point(instances, *args, **kwargs) 74 | elif self.mask_mode == 'Circle': 75 | assert False, "Circle not support best path." 76 | elif self.mask_mode == 'Scribble': 77 | assert False, "Scribble not support best path." 78 | elif self.mask_mode == 'Polygon': 79 | assert False, "Polygon not support best path." 
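# Minimal usage sketch for the prompt samplers in this package (illustrative only, not
# part of the original module). It assumes a CUDA device, since the samplers move the
# sampled prompts to GPU, and the cfg dict below only mirrors the STROKE_SAMPLER keys
# read by the from_config()/__init__ methods above; real values come from the training
# configs. The toy masks stand in for GT instance masks supplied by a dataset mapper.
#
# import torch
# from datasets.shapes import build_shape_sampler
#
# cfg = {'STROKE_SAMPLER': {
#     'EVAL': {'MODE': 'random', 'NEGATIVE': False, 'MAX_ITER': 20},
#     'MAX_CANDIDATE': 1, 'DILATION': 3,
#     'CANDIDATE_NAMES': ['Scribble'], 'CANDIDATE_PROBS': [1.0],
#     'SCRIBBLE': {'NUM_STROKES': 5,
#                  'STROKE_PRESET': ['rand_curve', 'rand_curve_small'],
#                  'STROKE_PROB': [0.5, 0.5]}}}
# sampler = build_shape_sampler(cfg, is_train=False, mode='Scribble')  # 'random' -> ShapeSampler
# masks = torch.zeros(2, 256, 256, dtype=torch.bool).cuda()
# masks[:, 64:128, 64:128] = True                       # two toy square GT masks
# boxes = torch.zeros(2, 4)                              # unused by the Scribble shape
# out = sampler(masks, boxes)
# # out['rand_shape']: (num_masks, iter, H, W) sampled scribble prompts,
# # out['types']: per-mask prompt type, e.g. ['scribble', 'scribble']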
80 | -------------------------------------------------------------------------------- /datasets/utils/tsv/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Yihao Chen 3 | # @Date: 2021-08-16 16:56:22 4 | # @Last Modified by: Yihao Chen 5 | # @Last Modified time: 2021-08-16 17:00:28 6 | 7 | from .io_common import FileProgressingbar, img_from_base64, generate_lineidx 8 | from .tsv_io import TSVFile 9 | 10 | __all__ = [ 11 | 'FileProgressingbar', 'img_from_base64', 'generate_lineidx', 'TSVFile' 12 | ] -------------------------------------------------------------------------------- /datasets/utils/tsv/io_common.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Yihao Chen 3 | # @Date: 2021-08-13 14:35:27 4 | # @Last Modified by: Yihao Chen 5 | # @Last Modified time: 2022-04-24 11:38:58 6 | 7 | import os 8 | import base64 9 | from io import BytesIO 10 | from PIL import Image 11 | 12 | import cv2 13 | import yaml 14 | import progressbar 15 | import numpy as np 16 | import torchvision.transforms as T 17 | 18 | class FileProgressingbar: 19 | fileobj = None 20 | pbar = None 21 | def __init__(self, fileobj, msg): 22 | fileobj.seek(0, os.SEEK_END) 23 | flen = fileobj.tell() 24 | fileobj.seek(0, os.SEEK_SET) 25 | self.fileobj = fileobj 26 | widgets = [msg, progressbar.AnimatedMarker(), ' ', progressbar.Percentage(), ' ', progressbar.Bar(), ' ', progressbar.ETA()] 27 | self.pbar = progressbar.ProgressBar(widgets=widgets, maxval=flen).start() 28 | 29 | def update(self): 30 | self.pbar.update(self.fileobj.tell()) 31 | 32 | 33 | def img_from_base64(imagestring): 34 | jpgbytestring = base64.b64decode(imagestring) 35 | image = BytesIO(jpgbytestring) 36 | image = Image.open(image).convert("RGB") 37 | return image 38 | 39 | # jpgbytestring = base64.b64decode(imagestring) 40 | # nparr = np.frombuffer(jpgbytestring, np.uint8) 41 | # try: 42 | # r = cv2.imdecode(nparr, cv2.IMREAD_COLOR) 43 | # # r = cv2.cvtColor(r, cv2.COLOR_BGR2RGB) 44 | # return r 45 | # except: 46 | # return None 47 | 48 | 49 | def generate_lineidx(filein, idxout): 50 | assert not os.path.isfile(idxout) 51 | with open(filein, 'r') as tsvin, open(idxout, 'w') as tsvout: 52 | bar = FileProgressingbar(tsvin, 'Generating lineidx {0}: '.format(idxout)) 53 | fsize = os.fstat(tsvin.fileno()).st_size 54 | fpos = 0 55 | while fpos != fsize: 56 | tsvout.write(str(fpos)+"\n") 57 | tsvin.readline() 58 | fpos = tsvin.tell() 59 | bar.update() 60 | -------------------------------------------------------------------------------- /datasets/utils/tsv/tsv_io.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Yihao Chen 3 | # @Date: 2021-08-13 14:26:21 4 | # @Last Modified by: Yihao Chen 5 | # @Last Modified time: 2022-08-17 00:57:51 6 | import time 7 | import os 8 | import os.path as op 9 | from .io_common import generate_lineidx, FileProgressingbar 10 | 11 | 12 | class TSVFile(object): 13 | def __init__(self, tsv_file, silence=True): 14 | self.tsv_file = tsv_file 15 | self.lineidx = op.splitext(tsv_file)[0] + '.lineidx' 16 | 17 | self.label_file = op.splitext(tsv_file)[0] + '.label' 18 | self.label_lineidx = op.splitext(tsv_file)[0] + '.label.lineidx' 19 | 20 | if os.path.exists(self.label_file): 21 | self.split_label = True 22 | else: 23 | self.split_label = False 24 | 25 | self._fp = None 26 | self._lineidx = None 27 | 28 
| self._label_fp = None 29 | self._label_lineidx = None 30 | 31 | self.pid = None 32 | self.silence = silence 33 | self._ensure_lineidx_loaded() 34 | 35 | def num_rows(self): 36 | return len(self._lineidx) 37 | 38 | def seek(self, idx): 39 | self._ensure_tsv_opened() 40 | pos = self._lineidx[idx] 41 | self._fp.seek(pos) 42 | tsv_info = [s.strip() for s in self._fp.readline().split('\t')] 43 | 44 | if self.split_label: 45 | label_pos = self._label_lineidx[idx] 46 | self._label_fp.seek(label_pos) 47 | label_info = [s.strip() for s in self._label_fp.readline().split('\t')] 48 | 49 | assert tsv_info[0] == label_info[0] 50 | tsv_info = [tsv_info[0], label_info[-1], tsv_info[-1]] 51 | 52 | return tsv_info 53 | 54 | def close(self): 55 | if self._fp is not None: 56 | self._fp.close() 57 | del self._fp 58 | del self._lineidx 59 | 60 | self._fp = None 61 | self._lineidx = None 62 | 63 | def _ensure_lineidx_loaded(self): 64 | if not op.isfile(self.lineidx) and not op.islink(self.lineidx): 65 | generate_lineidx(self.tsv_file, self.lineidx) 66 | 67 | if self._lineidx is None: 68 | with open(self.lineidx, 'r') as fp: 69 | lines = fp.readlines() 70 | self._lineidx = [int(i.strip().split()[0]) for i in lines] 71 | 72 | if self.split_label: 73 | with open(self.label_lineidx, 'r') as fp: 74 | lines = fp.readlines() 75 | self._label_lineidx = [int(i.strip().split()[0]) for i in lines] 76 | 77 | 78 | def _ensure_tsv_opened(self): 79 | self._ensure_lineidx_loaded() 80 | if self._fp is None: 81 | self._fp = open(self.tsv_file, 'r') 82 | self.pid = os.getpid() 83 | 84 | if self.split_label: 85 | self._label_fp = open(self.label_file, 'r') 86 | 87 | if self.pid != os.getpid(): 88 | print('re-open {} because the process id changed'.format(self.tsv_file)) 89 | self._fp = open(self.tsv_file, 'r') 90 | self.pid = os.getpid() 91 | 92 | if self.split_label: 93 | self._label_fp = open(self.label_file, 'r') 94 | -------------------------------------------------------------------------------- /demo/__init__.py: -------------------------------------------------------------------------------- 1 | from .openset_task import task_openset -------------------------------------------------------------------------------- /demo/examples/bags.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UX-Decoder/DINOv/53bf20d5cfdbb86fa35141a1cff432d4923599f2/demo/examples/bags.jpg -------------------------------------------------------------------------------- /demo/examples/corgi2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UX-Decoder/DINOv/53bf20d5cfdbb86fa35141a1cff432d4923599f2/demo/examples/corgi2.jpg -------------------------------------------------------------------------------- /demo/examples/img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UX-Decoder/DINOv/53bf20d5cfdbb86fa35141a1cff432d4923599f2/demo/examples/img.png -------------------------------------------------------------------------------- /demo/examples/ref_cat.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UX-Decoder/DINOv/53bf20d5cfdbb86fa35141a1cff432d4923599f2/demo/examples/ref_cat.jpeg -------------------------------------------------------------------------------- /demo/openset_task.py: -------------------------------------------------------------------------------- 1 | # 
-------------------------------------------------------- 2 | # Semantic-SAM: Segment and Recognize Anything at Any Granularity 3 | # Copyright (c) 2023 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Hao Zhang (hzhangcx@connect.ust.hk) 6 | # -------------------------------------------------------- 7 | # Copyright (c) 2024 Microsoft 8 | # Licensed under The MIT License [see LICENSE for details] 9 | # Written by Feng Li (fliay@connect.ust.hk) 10 | # -------------------------------------------------------- 11 | 12 | import torch 13 | import numpy as np 14 | from torchvision import transforms 15 | from utils.visualizer import Visualizer 16 | from typing import Tuple 17 | from PIL import Image 18 | from detectron2.data import MetadataCatalog 19 | import os 20 | import cv2 21 | 22 | metadata = MetadataCatalog.get('coco_2017_train_panoptic') 23 | 24 | 25 | def inverse_sigmoid(x, eps=1e-5): 26 | x = x.clamp(min=0, max=1) 27 | x1 = x.clamp(min=eps) 28 | x2 = (1 - x).clamp(min=eps) 29 | return torch.log(x1/x2) 30 | 31 | def task_openset(model,generic_vp1, generic_vp2, generic_vp3, generic_vp4, 32 | generic_vp5, generic_vp6, generic_vp7, generic_vp8, image_tgt=None, text_size=640,hole_scale=100,island_scale=100): 33 | in_context_examples = [generic_vp1, generic_vp2, generic_vp3, generic_vp4, 34 | generic_vp5, generic_vp6, generic_vp7, generic_vp8] 35 | in_context_examples = [x for x in in_context_examples if x is not None] 36 | t = [] 37 | t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC)) 38 | def prepare_image(image_ori): 39 | width = image_ori.size[0] 40 | height = image_ori.size[1] 41 | image_ori = np.asarray(image_ori) 42 | images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda() 43 | return images, height, width 44 | transform1 = transforms.Compose(t) 45 | image_ori_tgt = transform1(image_tgt) 46 | images_tgt, height_tgt, width_tgt = prepare_image(image_ori_tgt) 47 | data_tgt = {"image": images_tgt, "height": height_tgt, "width": width_tgt} 48 | batched_inputs = [] 49 | batched_inputs_tgt = [data_tgt] 50 | multi_scale_features2, mask_features2, _, _ = model.model.get_encoder_feature(batched_inputs_tgt) 51 | input_query_label_content_all = [] 52 | point_coords = torch.ones(1, 4).cuda().float() 53 | point_coords[:, :2] = 0. 54 | input_query_bbox_content_init = inverse_sigmoid(point_coords[None]) 55 | for image in in_context_examples: 56 | image_ori = transform1(image['image']) 57 | mask_ori = transform1(image['mask']) 58 | images, height, width = prepare_image(image_ori) 59 | 60 | data = {"image": images, "height": height, "width": width} 61 | data['seg_image'] = data_tgt 62 | 63 | mask_ori = np.asarray(mask_ori)[:,:,0:1].copy() 64 | mask_ori = torch.from_numpy(mask_ori).permute(2,0,1) 65 | 66 | data['targets'] = [dict()] 67 | data['targets'][0]['rand_shape']=mask_ori 68 | data['targets'][0]['pb']=torch.tensor([1.]) # FIXME 0 or 1 69 | 70 | frame = data 71 | rand_shape = mask_ori 72 | frame['targets'][0]['rand_shape'] = rand_shape 73 | 74 | batched_inputs.append(frame) 75 | 76 | multi_scale_features, _, padded_h, padded_w = model.model.get_encoder_feature([frame]) 77 | input_query_label_content, input_query_bbox_content, attn_mask_content = model.model. 
\
 78 |             get_visual_prompt_content_feature(multi_scale_features, frame['targets'][0]['rand_shape'], padded_h, padded_w)
79 |         input_query_label_content_all.append(input_query_label_content)
80 | 
81 |     # prompt to tgt image
82 |     input_query_label_content_current = torch.stack(input_query_label_content_all).mean(0)
83 |     masks, ious, ori_masks, scores_per_image_openset = model.model.evaluate_demo_content_openset_multi_with_content_features(
84 |         batched_inputs_tgt, mask_features2, multi_scale_features2, input_query_label_content_current,
85 |         input_query_bbox_content_init, attn_mask_content, padded_h, padded_w)
86 |     if len(ious.shape)>1:
87 |         ious=ious[0]
88 |     ids=torch.argsort(scores_per_image_openset,descending=True)
89 |     areas=[]
90 |     image_ori = image_ori_tgt
91 |     new_pred_mask = []
92 |     new_pred_class_score = []
93 |     for i in ids:
94 |         new_pred_class_score.append(scores_per_image_openset[i])
95 |         new_pred_mask.append(masks[i])
96 |     pred_masks_poses = new_pred_mask
97 |     ious = new_pred_class_score
98 |     visual = Visualizer(image_ori, metadata=metadata)
99 |     for i,(pred_masks_pos,iou, _, _) in enumerate(zip(pred_masks_poses,ious, pred_masks_poses, pred_masks_poses)):
100 |         iou=round(float(iou),2)
101 |         texts=f'{iou}'
102 |         mask=(pred_masks_pos>0.0).cpu().numpy()
103 |         area=mask.sum()
104 |         areas.append(area)
105 |         # uncomment for additional postprocessing
106 |         # mask,_=remove_small_regions(mask,int(hole_scale),mode="holes")
107 |         # mask,_=remove_small_regions(mask,int(island_scale),mode="islands")
108 |         mask=(mask).astype(np.float32)
109 |         color=[0.,0.,1.0]
110 |         color=[0.502, 0.0, 0.502]
111 |         demo = visual.draw_binary_mask(mask, text='', alpha=0.7, edge_color=color)
112 |         res = demo.get_image()
113 | 
114 |     torch.cuda.empty_cache()
115 | 
116 |     return res
117 | 
118 | def remove_small_regions(
119 |     mask: np.ndarray, area_thresh: float, mode: str
120 | ) -> Tuple[np.ndarray, bool]:
121 |     """
122 |     Removes small disconnected regions and holes in a mask. Returns the
123 |     mask and an indicator of whether the mask has been modified.
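    With mode "holes", enclosed background regions smaller than area_thresh are
    filled in; with mode "islands", connected foreground components smaller than
    area_thresh are removed.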
124 |     """
125 |     import cv2  # type: ignore
126 | 
127 |     assert mode in ["holes", "islands"]
128 |     correct_holes = mode == "holes"
129 |     working_mask = (correct_holes ^ mask).astype(np.uint8)
130 |     n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
131 |     sizes = stats[:, -1][1:]  # Row 0 is background label
132 |     small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
133 |     if len(small_regions) == 0:
134 |         return mask, False
135 |     fill_labels = [0] + small_regions
136 |     if not correct_holes:
137 |         fill_labels = [i for i in range(n_labels) if i not in fill_labels]
138 |         # If every region is below threshold, keep largest
139 |         if len(fill_labels) == 0:
140 |             fill_labels = [int(np.argmax(sizes)) + 1]
141 |     mask = np.isin(regions, fill_labels)
142 |     return mask, True
-------------------------------------------------------------------------------- /demo_openset.py: --------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Semantic-SAM: Segment and Recognize Anything at Any Granularity
3 | # Copyright (c) 2023 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Hao Zhang (hzhangcx@connect.ust.hk)
6 | # --------------------------------------------------------
7 | # Copyright (c) 2024 Microsoft
8 | # Licensed under The MIT License [see LICENSE for details]
9 | # Written by Feng Li (fliay@connect.ust.hk)
10 | # --------------------------------------------------------
11 | 
12 | 
13 | import gradio as gr
14 | import torch
15 | import argparse
16 | 
17 | from dinov.BaseModel import BaseModel
18 | from dinov import build_model
19 | from utils.arguments import load_opt_from_config_file
20 | 
21 | from demo import task_openset
22 | 
23 | def parse_option():
24 |     parser = argparse.ArgumentParser('DINOv Demo', add_help=False)
25 |     parser.add_argument('--conf_files', default="configs/dinov_sam_coco_swinl_train.yaml", metavar="FILE", help='path to config file', )
26 |     parser.add_argument('--ckpt', default="", metavar="FILE", help='path to ckpt', required=True)
27 |     parser.add_argument('--port', default=6099, type=int, help='port for the gradio server', )
28 |     args = parser.parse_args()
29 | 
30 |     return args
31 | 
32 | 
33 | class ImageMask(gr.components.Image):
34 |     """
35 |     Sets: source="upload", tool="sketch"
36 |     """
37 | 
38 |     is_template = True
39 | 
40 |     def __init__(self, **kwargs):
41 |         super().__init__(source="upload", tool="sketch", interactive=True, **kwargs)
42 | 
43 |     def preprocess(self, x):
44 |         return super().preprocess(x)
45 | 
46 | 
47 | '''
48 | build args
49 | '''
50 | args = parse_option()
51 | 
52 | '''
53 | build model
54 | '''
55 | 
56 | sam_cfg=args.conf_files
57 | 
58 | opt = load_opt_from_config_file(sam_cfg)
59 | 
60 | model_sam = BaseModel(opt, build_model(opt)).from_pretrained(args.ckpt).eval().cuda()
61 | 
62 | @torch.no_grad()
63 | def inference(generic_vp1, generic_vp2, generic_vp3, generic_vp4,
64 |               generic_vp5, generic_vp6, generic_vp7, generic_vp8, image2,*args, **kwargs):
65 |     with torch.autocast(device_type='cuda', dtype=torch.float16):
66 |         model=model_sam
67 |         a= task_openset(model, generic_vp1, generic_vp2, generic_vp3, generic_vp4,
68 |                         generic_vp5, generic_vp6, generic_vp7, generic_vp8, image2, *args, **kwargs)
69 |         return a
70 | 
71 | 
72 | '''
73 | launch app
74 | '''
75 | title = "DINOv: Visual In-Context Prompting"
76 | 
77 | article = "The demo runs on DINOv."
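# Lay out the Gradio UI below: a target image, eight scribble-prompt tabs (one per
# in-context example), and a Run button wired to the `inference` function defined above.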
78 | 
79 | demo = gr.Blocks()
80 | image_tgt=gr.components.Image(label="Target Image ",type="pil",brush_radius=15.0)
81 | gallery_output=gr.components.Image(label="Results Image ",type="pil",brush_radius=15.0)
82 | 
83 | generic_vp1 = ImageMask(label="scribble on refer Image 1",type="pil",brush_radius=15.0)
84 | generic_vp2 = ImageMask(label="scribble on refer Image 2",type="pil",brush_radius=15.0)
85 | generic_vp3 = ImageMask(label="scribble on refer Image 3",type="pil",brush_radius=15.0)
86 | generic_vp4 = ImageMask(label="scribble on refer Image 4",type="pil",brush_radius=15.0)
87 | generic_vp5 = ImageMask(label="scribble on refer Image 5",type="pil",brush_radius=15.0)
88 | generic_vp6 = ImageMask(label="scribble on refer Image 6",type="pil",brush_radius=15.0)
89 | generic_vp7 = ImageMask(label="scribble on refer Image 7",type="pil",brush_radius=15.0)
90 | generic_vp8 = ImageMask(label="scribble on refer Image 8",type="pil",brush_radius=15.0)
91 | generic = gr.TabbedInterface([
92 |     generic_vp1, generic_vp2, generic_vp3, generic_vp4,
93 |     generic_vp5, generic_vp6, generic_vp7, generic_vp8
94 | ], ["1", "2", "3", "4", "5", "6", "7", "8"])
95 | 
96 | title='''
97 | # DINOv: Visual In-Context Prompting
98 | 
99 | # [[Read our arXiv Paper](https://arxiv.org/pdf/2311.13601.pdf)\]   \[[Github page](https://github.com/UX-Decoder/DINOv)\]
100 | '''
101 | 
102 | with demo:
103 |     with gr.Row():
104 |         with gr.Column(scale=3.0):
105 |             generation_title = gr.Markdown(title)
106 |             image_tgt.render()
107 |             generic.render()
108 |             with gr.Row(scale=2.0):
109 |                 clearBtn = gr.ClearButton(
110 |                     components=[image_tgt])
111 |                 runBtn = gr.Button("Run")
112 |         with gr.Column(scale=5.0):
113 | 
114 |             gallery_title = gr.Markdown("# Open-set results.")
115 |             with gr.Row(scale=9.0):
116 |                 gallery_output.render()
117 | 
118 |             example = gr.Examples(
119 |                 examples=[
120 |                     ["demo/examples/bags.jpg"],
121 |                     ["demo/examples/img.png"],
122 |                     ["demo/examples/corgi2.jpg"],
123 |                     ["demo/examples/ref_cat.jpeg"],
124 |                 ],
125 |                 inputs=image_tgt,
126 |                 cache_examples=False,
127 |             )
128 | 
129 |     title = title,
130 |     article = article,
131 |     allow_flagging = 'never',
132 | 
133 |     runBtn.click(inference, inputs=[generic_vp1, generic_vp2, generic_vp3, generic_vp4,
134 |                                     generic_vp5, generic_vp6, generic_vp7, generic_vp8, image_tgt],
135 |                  outputs = [gallery_output])
136 | 
137 | 
138 | 
139 | demo.queue().launch(share=True,server_port=args.port)
140 | 
141 | 
-------------------------------------------------------------------------------- /dinov/BaseModel.py: --------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | 
4 | import torch
5 | import torch.nn as nn
6 | 
7 | from utils.model import align_and_update_state_dicts
8 | 
9 | logger = logging.getLogger(__name__)
10 | 
11 | 
12 | class BaseModel(nn.Module):
13 |     def __init__(self, opt, module: nn.Module):
14 |         super(BaseModel, self).__init__()
15 |         self.opt = opt
16 |         self.model = module
17 | 
18 |     def forward(self, *inputs, **kwargs):
19 |         outputs = self.model(*inputs, **kwargs)
20 |         return outputs
21 | 
22 |     def from_pretrained(self, load_dir):
23 |         state_dict = torch.load(load_dir, map_location='cpu')
24 |         if 'model' in state_dict:
25 |             state_dict=state_dict['model']
26 |             state_dict={k[6:]:v for k,v in state_dict.items() if k.startswith('model.')}
27 |         state_dict = align_and_update_state_dicts(self.model.state_dict(), state_dict)
28 |         self.model.load_state_dict(state_dict, strict=False)
29 |         return self
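For reference, `demo_openset.py` above already exercises this loader end to end; the minimal sketch below just repeats that pattern in isolation (the checkpoint path is a placeholder, not a file shipped with the repository):

```python
# Minimal loading sketch mirroring demo_openset.py; the checkpoint path is a placeholder.
from dinov.BaseModel import BaseModel
from dinov import build_model
from utils.arguments import load_opt_from_config_file

opt = load_opt_from_config_file("configs/dinov_sam_coco_swinl_train.yaml")
model = BaseModel(opt, build_model(opt)).from_pretrained("/path/to/dinov_ckpt.pt").eval().cuda()
```

`from_pretrained` accepts both raw weight files and trainer checkpoints that wrap the weights under a `model` key: it unwraps the state dict, strips the `model.` prefix, aligns it against the current model, and loads with `strict=False`.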
-------------------------------------------------------------------------------- /dinov/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from .architectures import build_model 6 | from utils.dist import get_world_size, all_gather -------------------------------------------------------------------------------- /dinov/architectures/__init__.py: -------------------------------------------------------------------------------- 1 | from .dinov import * 2 | from .build import build_model 3 | 4 | -------------------------------------------------------------------------------- /dinov/architectures/build.py: -------------------------------------------------------------------------------- 1 | from .registry import model_entrypoints 2 | from .registry import is_model 3 | 4 | 5 | def build_model(config, **kwargs): 6 | model_name = config['MODEL']['NAME'] 7 | 8 | if not is_model(model_name): 9 | raise ValueError(f'Unkown model: {model_name}') 10 | 11 | return model_entrypoints(model_name)(config, **kwargs) -------------------------------------------------------------------------------- /dinov/architectures/registry.py: -------------------------------------------------------------------------------- 1 | _model_entrypoints = {} 2 | 3 | def register_model(fn): 4 | module_name_split = fn.__module__.split('.') 5 | model_name = module_name_split[-1] 6 | _model_entrypoints[model_name] = fn 7 | return fn 8 | 9 | def model_entrypoints(model_name): 10 | return _model_entrypoints[model_name] 11 | 12 | def is_model(model_name): 13 | return model_name in _model_entrypoints -------------------------------------------------------------------------------- /dinov/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import build_backbone 2 | 3 | from .focal import * 4 | from .focal_dw import * 5 | from .swin import * 6 | from .backbone import * -------------------------------------------------------------------------------- /dinov/backbone/backbone.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import torch.nn as nn 3 | 4 | from detectron2.modeling import ShapeSpec 5 | 6 | __all__ = ["Backbone"] 7 | 8 | 9 | class Backbone(nn.Module): 10 | """ 11 | Abstract base class for network backbones. 12 | """ 13 | 14 | def __init__(self): 15 | """ 16 | The `__init__` method of any subclass can specify its own set of arguments. 17 | """ 18 | super().__init__() 19 | 20 | def forward(self): 21 | """ 22 | Subclasses must override this method, but adhere to the same return type. 23 | 24 | Returns: 25 | dict[str->Tensor]: mapping from feature name (e.g., "res2") to tensor 26 | """ 27 | pass 28 | 29 | @property 30 | def size_divisibility(self) -> int: 31 | """ 32 | Some backbones require the input height and width to be divisible by a 33 | specific integer. This is typically true for encoder / decoder type networks 34 | with lateral connection (e.g., FPN) for which feature maps need to match 35 | dimension in the "bottom up" and "top down" paths. Set to 0 if no specific 36 | input size divisibility is required. 
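        Detectron2's ImageList.from_tensors uses this value to pad batched images
        before they reach the backbone.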
37 | """ 38 | return 0 39 | 40 | def output_shape(self): 41 | """ 42 | Returns: 43 | dict[str->ShapeSpec] 44 | """ 45 | # this is a backward-compatible default 46 | return { 47 | name: ShapeSpec( 48 | channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] 49 | ) 50 | for name in self._out_features 51 | } 52 | -------------------------------------------------------------------------------- /dinov/backbone/build.py: -------------------------------------------------------------------------------- 1 | from .registry import model_entrypoints 2 | from .registry import is_model 3 | 4 | from .backbone import * 5 | 6 | def build_backbone(config, **kwargs): 7 | model_name = config['MODEL']['BACKBONE']['NAME'] 8 | if not is_model(model_name): 9 | raise ValueError(f'Unkown model: {model_name}') 10 | 11 | return model_entrypoints(model_name)(config, **kwargs) -------------------------------------------------------------------------------- /dinov/backbone/registry.py: -------------------------------------------------------------------------------- 1 | _model_entrypoints = {} 2 | 3 | 4 | def register_backbone(fn): 5 | module_name_split = fn.__module__.split('.') 6 | model_name = module_name_split[-1] 7 | _model_entrypoints[model_name] = fn 8 | return fn 9 | 10 | def model_entrypoints(model_name): 11 | return _model_entrypoints[model_name] 12 | 13 | def is_model(model_name): 14 | return model_name in _model_entrypoints 15 | -------------------------------------------------------------------------------- /dinov/body/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import build_openseed_head -------------------------------------------------------------------------------- /dinov/body/build.py: -------------------------------------------------------------------------------- 1 | from .registry import model_entrypoints 2 | from .registry import is_model 3 | from .general_head import * 4 | 5 | 6 | def build_openseed_head(config, *args, **kwargs): 7 | model_name = config['MODEL']['HEAD'] 8 | if not is_model(model_name): 9 | raise ValueError(f'Unkown model: {model_name}') 10 | 11 | body = model_entrypoints(model_name)(config, *args, **kwargs) 12 | return body -------------------------------------------------------------------------------- /dinov/body/decoder/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import build_decoder 2 | from .dinov_openset_decoder import * 3 | -------------------------------------------------------------------------------- /dinov/body/decoder/build.py: -------------------------------------------------------------------------------- 1 | from .registry import model_entrypoints 2 | from .registry import is_model 3 | 4 | 5 | def build_decoder(config, *args, **kwargs): 6 | model_name = config['MODEL']['DECODER']['NAME'] 7 | 8 | if not is_model(model_name): 9 | raise ValueError(f'Unkown model: {model_name}') 10 | 11 | return model_entrypoints(model_name)(config, *args, **kwargs) -------------------------------------------------------------------------------- /dinov/body/decoder/registry.py: -------------------------------------------------------------------------------- 1 | _model_entrypoints = {} 2 | 3 | def register_decoder(fn): 4 | module_name_split = fn.__module__.split('.') 5 | model_name = module_name_split[-1] 6 | _model_entrypoints[model_name] = fn 7 | return fn 8 | 9 | def model_entrypoints(model_name): 10 | return 
_model_entrypoints[model_name] 11 | 12 | def is_model(model_name): 13 | return model_name in _model_entrypoints -------------------------------------------------------------------------------- /dinov/body/decoder/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import * -------------------------------------------------------------------------------- /dinov/body/decoder/utils/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import copy 3 | from torch import nn, Tensor 4 | import os 5 | 6 | import math 7 | import torch.nn.functional as F 8 | from torch import nn 9 | 10 | 11 | class MLP(nn.Module): 12 | """ Very simple multi-layer perceptron (also called FFN)""" 13 | 14 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 15 | super().__init__() 16 | self.num_layers = num_layers 17 | h = [hidden_dim] * (num_layers - 1) 18 | self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) 19 | 20 | def forward(self, x): 21 | for i, layer in enumerate(self.layers): 22 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 23 | return x 24 | 25 | 26 | def inverse_sigmoid(x, eps=1e-5): 27 | x = x.clamp(min=0, max=1) 28 | x1 = x.clamp(min=eps) 29 | x2 = (1 - x).clamp(min=eps) 30 | return torch.log(x1/x2) 31 | 32 | 33 | def gen_encoder_output_proposals(memory:Tensor, memory_padding_mask:Tensor, spatial_shapes:Tensor): 34 | """ 35 | Input: 36 | - memory: bs, \sum{hw}, d_model 37 | - memory_padding_mask: bs, \sum{hw} 38 | - spatial_shapes: nlevel, 2 39 | Output: 40 | - output_memory: bs, \sum{hw}, d_model 41 | - output_proposals: bs, \sum{hw}, 4 42 | """ 43 | N_, S_, C_ = memory.shape 44 | base_scale = 4.0 45 | proposals = [] 46 | _cur = 0 47 | for lvl, (H_, W_) in enumerate(spatial_shapes): 48 | mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1) 49 | valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1) 50 | valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1) 51 | 52 | grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device), 53 | torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device)) 54 | grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) 55 | 56 | scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2) 57 | grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale 58 | wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl) 59 | proposal = torch.cat((grid, wh), -1).view(N_, -1, 4) 60 | proposals.append(proposal) 61 | _cur += (H_ * W_) 62 | output_proposals = torch.cat(proposals, 1) 63 | output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) 64 | output_proposals = torch.log(output_proposals / (1 - output_proposals)) 65 | output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf')) 66 | output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf')) 67 | 68 | output_memory = memory 69 | output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0)) 70 | output_memory = output_memory.masked_fill(~output_proposals_valid, float(0)) 71 | return output_memory, output_proposals 72 | 73 | 74 | def gen_sineembed_for_position(pos_tensor, dim=128): 75 | # n_query, bs, _ = pos_tensor.size() 76 | # sineembed_tensor = torch.zeros(n_query, bs, 256) 77 | scale = 2 
* math.pi 78 | dim_t = torch.arange(dim, dtype=torch.float32, device=pos_tensor.device) 79 | dim_t = 10000 ** (2 * (dim_t // 2) / dim) 80 | x_embed = pos_tensor[:, :, 0] * scale 81 | y_embed = pos_tensor[:, :, 1] * scale 82 | pos_x = x_embed[:, :, None] / dim_t 83 | pos_y = y_embed[:, :, None] / dim_t 84 | pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) 85 | pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) 86 | if pos_tensor.size(-1) == 2: 87 | pos = torch.cat((pos_y, pos_x), dim=2) 88 | elif pos_tensor.size(-1) == 4: 89 | w_embed = pos_tensor[:, :, 2] * scale 90 | pos_w = w_embed[:, :, None] / dim_t 91 | pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2) 92 | 93 | h_embed = pos_tensor[:, :, 3] * scale 94 | pos_h = h_embed[:, :, None] / dim_t 95 | pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2) 96 | 97 | pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) 98 | else: 99 | raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1))) 100 | return pos 101 | 102 | 103 | def _get_activation_fn(activation): 104 | """Return an activation function given a string""" 105 | if activation == "relu": 106 | return F.relu 107 | if activation == "gelu": 108 | return F.gelu 109 | if activation == "glu": 110 | return F.glu 111 | if activation == "prelu": 112 | return nn.PReLU() 113 | if activation == "selu": 114 | return F.selu 115 | raise RuntimeError(F"activation should be relu/gelu, not {activation}.") 116 | 117 | 118 | def _get_clones(module, N, layer_share=False): 119 | 120 | if layer_share: 121 | return nn.ModuleList([module for i in range(N)]) 122 | else: 123 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) 124 | 125 | def from_divisablity(x, div): 126 | if x % div == 0: 127 | return x 128 | return (int(x / div) + 1) * div 129 | 130 | def getIdx(a, id_start): 131 | co = a.unsqueeze(0) - a.unsqueeze(1) 132 | uniquer = co.unique(dim=0) 133 | out = [] 134 | for r in uniquer: 135 | cover = torch.arange(a.size(0)).to(a) 136 | mask = r == 0 137 | idx = cover[mask] 138 | out.append(idx) 139 | out = [o + id_start for o in out] 140 | return {str(k.cpu().numpy()): v for k, v in zip(a.unique(), out[::-1])} 141 | 142 | def get_world_size(): 143 | if torch.distributed.is_initialized(): 144 | return torch.distributed.get_world_size() 145 | return 1 146 | 147 | def all_gather(x): 148 | if get_world_size() > 1: 149 | all_x = [torch.zeros_like(x) for _ in range(get_world_size())] 150 | torch.distributed.all_gather(all_x, x.detach()) 151 | all_x[torch.distributed.get_rank()] = x 152 | x = torch.stack(all_x, dim=0) 153 | return x 154 | 155 | 156 | def get_unpadded_tensor(tensors, num_examples): 157 | new_tensor_list = [] 158 | for i, tensor in enumerate(tensors): 159 | new_tensor_list.append(tensor[:num_examples[i]]) 160 | return new_tensor_list -------------------------------------------------------------------------------- /dinov/body/encoder/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import build_encoder -------------------------------------------------------------------------------- /dinov/body/encoder/build.py: -------------------------------------------------------------------------------- 1 | from .registry import model_entrypoints 2 | from .registry import is_model 3 | from .encoder_deform import * 4 | 5 | def build_encoder(config, *args, **kwargs): 
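    # Dispatch on config['MODEL']['ENCODER']['NAME']: encoder modules (e.g. encoder_deform)
    # register an entrypoint under their module name in registry.py, and it is looked up here.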
6 |     model_name = config['MODEL']['ENCODER']['NAME']
7 | 
8 |     if not is_model(model_name):
9 |         raise ValueError(f'Unknown model: {model_name}')
10 | 
11 |     return model_entrypoints(model_name)(config, *args, **kwargs)
-------------------------------------------------------------------------------- /dinov/body/encoder/ops/functions/__init__.py: --------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 | 
9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | from .ms_deform_attn_func import MSDeformAttnFunction
13 | 
14 | 
-------------------------------------------------------------------------------- /dinov/body/encoder/ops/functions/ms_deform_attn_func.py: --------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 | 
9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | try: 22 | import MultiScaleDeformableAttention as MSDA 23 | except ModuleNotFoundError as e: 24 | info_string = ( 25 | "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" 26 | "\t`cd mask2former/modeling/pixel_decoder/ops`\n" 27 | "\t`sh make.sh`\n" 28 | ) 29 | raise ModuleNotFoundError(info_string) 30 | 31 | 32 | class MSDeformAttnFunction(Function): 33 | @staticmethod 34 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 35 | ctx.im2col_step = im2col_step 36 | output = MSDA.ms_deform_attn_forward( 37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 38 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 39 | return output 40 | 41 | @staticmethod 42 | @once_differentiable 43 | def backward(ctx, grad_output): 44 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 45 | grad_value, grad_sampling_loc, grad_attn_weight = \ 46 | MSDA.ms_deform_attn_backward( 47 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 48 | 49 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 50 | 51 | 52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 53 | # for debug and test only, 54 | # need to use cuda version instead 55 | N_, S_, M_, D_ = value.shape 56 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 57 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 58 | sampling_grids = 2 * sampling_locations - 1 59 | sampling_value_list = [] 60 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 61 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 62 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 63 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 64 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 65 | # N_*M_, D_, Lq_, P_ 66 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 67 | mode='bilinear', padding_mode='zeros', align_corners=False) 68 | sampling_value_list.append(sampling_value_l_) 69 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 70 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 71 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 72 | return output.transpose(1, 2).contiguous() 73 | -------------------------------------------------------------------------------- /dinov/body/encoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /dinov/body/encoder/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | import os 13 | import glob 14 | 15 | import torch 16 | 17 | from torch.utils.cpp_extension import CUDA_HOME 18 | from torch.utils.cpp_extension import CppExtension 19 | from torch.utils.cpp_extension import CUDAExtension 20 | 21 | from setuptools import find_packages 22 | from setuptools import setup 23 | 24 | requirements = ["torch", "torchvision"] 25 | 26 | def get_extensions(): 27 | this_dir = os.path.dirname(os.path.abspath(__file__)) 28 | extensions_dir = os.path.join(this_dir, "src") 29 | 30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 33 | 34 | sources = main_file + source_cpu 35 | extension = CppExtension 36 | extra_compile_args = {"cxx": []} 37 | define_macros = [] 38 | 39 | # Force cuda since torch ask for a device, not if cuda is in fact available. 40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: 41 | extension = CUDAExtension 42 | sources += source_cuda 43 | define_macros += [("WITH_CUDA", None)] 44 | extra_compile_args["nvcc"] = [ 45 | "-DCUDA_HAS_FP16=1", 46 | "-D__CUDA_NO_HALF_OPERATORS__", 47 | "-D__CUDA_NO_HALF_CONVERSIONS__", 48 | "-D__CUDA_NO_HALF2_OPERATORS__", 49 | ] 50 | else: 51 | if CUDA_HOME is None: 52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') 53 | else: 54 | raise NotImplementedError('No CUDA runtime is found. 
Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().')
55 | 
56 |     sources = [os.path.join(extensions_dir, s) for s in sources]
57 |     include_dirs = [extensions_dir]
58 |     ext_modules = [
59 |         extension(
60 |             "MultiScaleDeformableAttention",
61 |             sources,
62 |             include_dirs=include_dirs,
63 |             define_macros=define_macros,
64 |             extra_compile_args=extra_compile_args,
65 |         )
66 |     ]
67 |     return ext_modules
68 | 
69 | setup(
70 |     name="MultiScaleDeformableAttention",
71 |     version="1.0",
72 |     author="Weijie Su",
73 |     url="https://github.com/fundamentalvision/Deformable-DETR",
74 |     description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
75 |     packages=find_packages(exclude=("configs", "tests",)),
76 |     ext_modules=get_extensions(),
77 |     cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
78 | )
79 | 
-------------------------------------------------------------------------------- /dinov/body/encoder/ops/src/cpu/ms_deform_attn_cpu.cpp: --------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #include <vector>
17 | 
18 | #include <ATen/ATen.h>
19 | #include <ATen/cuda/CUDAContext.h>
20 | 
21 | 
22 | at::Tensor
23 | ms_deform_attn_cpu_forward(
24 |     const at::Tensor &value,
25 |     const at::Tensor &spatial_shapes,
26 |     const at::Tensor &level_start_index,
27 |     const at::Tensor &sampling_loc,
28 |     const at::Tensor &attn_weight,
29 |     const int im2col_step)
30 | {
31 |     AT_ERROR("Not implemented on the CPU");
32 | }
33 | 
34 | std::vector<at::Tensor>
35 | ms_deform_attn_cpu_backward(
36 |     const at::Tensor &value,
37 |     const at::Tensor &spatial_shapes,
38 |     const at::Tensor &level_start_index,
39 |     const at::Tensor &sampling_loc,
40 |     const at::Tensor &attn_weight,
41 |     const at::Tensor &grad_output,
42 |     const int im2col_step)
43 | {
44 |     AT_ERROR("Not implemented on the CPU");
45 | }
46 | 
-------------------------------------------------------------------------------- /dinov/body/encoder/ops/src/cpu/ms_deform_attn_cpu.h: --------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #pragma once
17 | #include <torch/extension.h>
18 | 
19 | at::Tensor
20 | ms_deform_attn_cpu_forward(
21 |     const at::Tensor &value,
22 |     const at::Tensor &spatial_shapes,
23 |     const at::Tensor &level_start_index,
24 |     const at::Tensor &sampling_loc,
25 |     const at::Tensor &attn_weight,
26 |     const int im2col_step);
27 | 
28 | std::vector<at::Tensor>
29 | ms_deform_attn_cpu_backward(
30 |     const at::Tensor &value,
31 |     const at::Tensor &spatial_shapes,
32 |     const at::Tensor &level_start_index,
33 |     const at::Tensor &sampling_loc,
34 |     const at::Tensor &attn_weight,
35 |     const at::Tensor &grad_output,
36 |     const int im2col_step);
37 | 
38 | 
39 | 
-------------------------------------------------------------------------------- /dinov/body/encoder/ops/src/cuda/ms_deform_attn_cuda.h: --------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #pragma once
17 | #include <torch/extension.h>
18 | 
19 | at::Tensor ms_deform_attn_cuda_forward(
20 |     const at::Tensor &value,
21 |     const at::Tensor &spatial_shapes,
22 |     const at::Tensor &level_start_index,
23 |     const at::Tensor &sampling_loc,
24 |     const at::Tensor &attn_weight,
25 |     const int im2col_step);
26 | 
27 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
28 |     const at::Tensor &value,
29 |     const at::Tensor &spatial_shapes,
30 |     const at::Tensor &level_start_index,
31 |     const at::Tensor &sampling_loc,
32 |     const at::Tensor &attn_weight,
33 |     const at::Tensor &grad_output,
34 |     const int im2col_step);
35 | 
36 | 
-------------------------------------------------------------------------------- /dinov/body/encoder/ops/src/ms_deform_attn.h: --------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #pragma once
17 | 
18 | #include "cpu/ms_deform_attn_cpu.h"
19 | 
20 | #ifdef WITH_CUDA
21 | #include "cuda/ms_deform_attn_cuda.h"
22 | #endif
23 | 
24 | 
25 | at::Tensor
26 | ms_deform_attn_forward(
27 |     const at::Tensor &value,
28 |     const at::Tensor &spatial_shapes,
29 |     const at::Tensor &level_start_index,
30 |     const at::Tensor &sampling_loc,
31 |     const at::Tensor &attn_weight,
32 |     const int im2col_step)
33 | {
34 |     if (value.type().is_cuda())
35 |     {
36 | #ifdef WITH_CUDA
37 |         return ms_deform_attn_cuda_forward(
38 |             value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
39 | #else
40 |         AT_ERROR("Not compiled with GPU support");
41 | #endif
42 |     }
43 |     AT_ERROR("Not implemented on the CPU");
44 | }
45 | 
46 | std::vector<at::Tensor>
47 | ms_deform_attn_backward(
48 |     const at::Tensor &value,
49 |     const at::Tensor &spatial_shapes,
50 |     const at::Tensor &level_start_index,
51 |     const at::Tensor &sampling_loc,
52 |     const at::Tensor &attn_weight,
53 |     const at::Tensor &grad_output,
54 |     const int im2col_step)
55 | {
56 |     if (value.type().is_cuda())
57 |     {
58 | #ifdef WITH_CUDA
59 |         return ms_deform_attn_cuda_backward(
60 |             value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
61 | #else
62 |         AT_ERROR("Not compiled with GPU support");
63 | #endif
64 |     }
65 |     AT_ERROR("Not implemented on the CPU");
66 | }
67 | 
68 | 
-------------------------------------------------------------------------------- /dinov/body/encoder/ops/src/vision.cpp: --------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #include "ms_deform_attn.h"
17 | 
18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
19 |   m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
20 |   m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
21 | }
22 | 
-------------------------------------------------------------------------------- /dinov/body/encoder/ops/test.py: --------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 | 
9 | # Copyright (c) Facebook, Inc.
and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import time 17 | import torch 18 | import torch.nn as nn 19 | from torch.autograd import gradcheck 20 | 21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | N, M, D = 1, 2, 2 25 | Lq, L, P = 2, 2, 2 26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 28 | S = sum([(H*W).item() for H, W in shapes]) 29 | 30 | 31 | torch.manual_seed(3) 32 | 33 | 34 | @torch.no_grad() 35 | def check_forward_equal_with_pytorch_double(): 36 | value = torch.rand(N, S, M, D).cuda() * 0.01 37 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 38 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 39 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 40 | im2col_step = 2 41 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 42 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 43 | fwdok = torch.allclose(output_cuda, output_pytorch) 44 | max_abs_err = (output_cuda - output_pytorch).abs().max() 45 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 46 | 47 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 48 | 49 | 50 | @torch.no_grad() 51 | def check_forward_equal_with_pytorch_float(): 52 | value = torch.rand(N, S, M, D).cuda() * 0.01 53 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 54 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 55 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 56 | im2col_step = 2 57 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 58 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 59 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 60 | max_abs_err = (output_cuda - output_pytorch).abs().max() 61 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 62 | 63 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 64 | 65 | 66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 67 | 68 | value = torch.rand(N, S, M, channels).cuda() * 0.01 69 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 70 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 71 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 72 | im2col_step = 2 73 | func = MSDeformAttnFunction.apply 74 | 75 | value.requires_grad = grad_value 76 | sampling_locations.requires_grad = grad_sampling_loc 77 | attention_weights.requires_grad = grad_attn_weight 78 | 79 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 80 | 81 
| print(f'* {gradok} check_gradient_numerical(D={channels})') 82 | 83 | 84 | if __name__ == '__main__': 85 | check_forward_equal_with_pytorch_double() 86 | check_forward_equal_with_pytorch_float() 87 | 88 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 89 | check_gradient_numerical(channels, True, True, True) 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /dinov/body/encoder/registry.py: -------------------------------------------------------------------------------- 1 | _model_entrypoints = {} 2 | 3 | def register_encoder(fn): 4 | module_name_split = fn.__module__.split('.') 5 | model_name = module_name_split[-1] 6 | _model_entrypoints[model_name] = fn 7 | return fn 8 | 9 | def model_entrypoints(model_name): 10 | return _model_entrypoints[model_name] 11 | 12 | def is_model(model_name): 13 | return model_name in _model_entrypoints 14 | -------------------------------------------------------------------------------- /dinov/body/general_head.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) MicroSoft, Inc. and its affiliates. 3 | # Modified from DINO https://github.com/IDEA-Research/MaskDINO by Feng Li. 4 | # ------------------------------------------------------------------------ 5 | import logging 6 | from typing import Callable, Dict, List, Optional, Tuple, Union 7 | 8 | from torch import nn 9 | 10 | from detectron2.layers import Conv2d, ShapeSpec, get_norm 11 | 12 | from .registry import register_body 13 | from .encoder import build_encoder 14 | from .decoder import build_decoder 15 | from ..utils import configurable 16 | 17 | 18 | class IMaskDINOHead(nn.Module): 19 | @configurable 20 | def __init__( 21 | self, 22 | input_shape: Dict[str, ShapeSpec], 23 | *, 24 | num_classes: int, 25 | pixel_decoder: nn.Module, 26 | loss_weight: float = 1.0, 27 | ignore_value: int = -1, 28 | transformer_predictor: nn.Module, 29 | ): 30 | """ 31 | Args: 32 | input_shape: shapes (channels and stride) of the input features 33 | num_classes: number of classes to predict 34 | pixel_decoder: the pixel decoder module 35 | loss_weight: loss weight 36 | ignore_value: category id to be ignored during training. 
37 | transformer_predictor: the transformer decoder that makes prediction 38 | transformer_in_feature: input feature name to the transformer_predictor 39 | """ 40 | super().__init__() 41 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) 42 | self.in_features = [k for k, v in input_shape] 43 | self.ignore_value = ignore_value 44 | self.common_stride = 4 45 | self.loss_weight = loss_weight 46 | 47 | self.pixel_decoder = pixel_decoder 48 | self.predictor = transformer_predictor 49 | 50 | self.num_classes = num_classes 51 | # store processed features 52 | self.processed_features = None 53 | 54 | @classmethod 55 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec], lang_encoder: nn.Module, extra: dict): 56 | enc_cfg = cfg['MODEL']['ENCODER'] 57 | dec_cfg = cfg['MODEL']['DECODER'] 58 | transformer_predictor_in_channels = enc_cfg['CONVS_DIM'] 59 | 60 | return { 61 | "input_shape": { 62 | k: v for k, v in input_shape.items() if k in enc_cfg['IN_FEATURES'] 63 | }, 64 | "ignore_value": enc_cfg['IGNORE_VALUE'], 65 | "num_classes": enc_cfg.get('NUM_CLASSES', None), 66 | "pixel_decoder": build_encoder(cfg, input_shape), 67 | "loss_weight": enc_cfg['LOSS_WEIGHT'], 68 | "transformer_predictor": build_decoder( 69 | cfg, 70 | transformer_predictor_in_channels, 71 | lang_encoder, 72 | mask_classification=True, 73 | extra=extra, 74 | ), 75 | } 76 | 77 | def forward_encoder(self, features, mask=None,targets=None, target_queries=None, target_vlp=None, prediction_switch=None, task='seg', extra={}): 78 | mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features( 79 | features, mask) 80 | self.processed_features = (mask_features, transformer_encoder_features, multi_scale_features) 81 | 82 | def forward_decoder(self, features, mask=None,targets=None, target_queries=None, target_vlp=None, prediction_switch=None, task='seg', extra={}): 83 | assert self.processed_features is not None, "need to precess features first" 84 | mask_features, transformer_encoder_features, multi_scale_features = self.processed_features 85 | predictions = self.predictor(multi_scale_features, mask_features, mask, targets=targets, 86 | target_queries=target_queries, target_vlp=target_vlp, task=task, extra=extra) 87 | return predictions 88 | 89 | def forward(self, features, mask=None, targets=None, target_queries=None, target_vlp=None, task='seg', extra={}): 90 | return self.layers(features, mask, targets=targets, target_queries=target_queries, target_vlp=target_vlp, task=task, extra=extra) 91 | 92 | def layers(self, features, mask=None,targets=None, target_queries=None, target_vlp=None, prediction_switch=None, task='seg', extra={}): 93 | mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(features, mask) 94 | predictions = self.predictor(multi_scale_features, mask_features, mask, targets=targets, 95 | target_queries=target_queries, target_vlp=target_vlp, task=task, extra=extra) 96 | return predictions 97 | 98 | 99 | @register_body 100 | def get_interactive_maskdino_head(cfg, input_shape, lang_encoder, extra): 101 | return IMaskDINOHead(cfg, input_shape, lang_encoder, extra) -------------------------------------------------------------------------------- /dinov/body/registry.py: -------------------------------------------------------------------------------- 1 | _model_entrypoints = {} 2 | 3 | 4 | def register_body(fn): 5 | module_name_split = fn.__module__.split('.') 6 | model_name = module_name_split[-1] 7 | 
_model_entrypoints[model_name] = fn 8 | return fn 9 | 10 | def model_entrypoints(model_name): 11 | return _model_entrypoints[model_name] 12 | 13 | def is_model(model_name): 14 | return model_name in _model_entrypoints -------------------------------------------------------------------------------- /dinov/language/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import build_language_encoder -------------------------------------------------------------------------------- /dinov/language/build.py: -------------------------------------------------------------------------------- 1 | """ 2 | placeholder for language open-set or grounding 3 | """ 4 | 5 | 6 | def build_language_encoder(config, **kwargs): 7 | model_name = config['MODEL']['TEXT']['ARCH'] 8 | if model_name=='noencoder': 9 | return None 10 | else: 11 | raise NotImplementedError -------------------------------------------------------------------------------- /dinov/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .position_encoding import * 2 | from .postprocessing import * 3 | from .matcher import * 4 | from .criterion_visual_refer_one2one import * 5 | from .criterion_visual_openset import * 6 | from .criterion_visual_refer_many2many import * 7 | from .matcher_many2many import * 8 | 9 | -------------------------------------------------------------------------------- /dinov/modules/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 
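    The forward pass returns a (B, 2 * num_pos_feats, H, W) tensor: sine/cosine embeddings
    of the (optionally normalized) y coordinates concatenated with those of the x coordinates.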
16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | if mask is None: 31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 32 | not_mask = ~mask 33 | y_embed = not_mask.cumsum(1, dtype=x.dtype) 34 | x_embed = not_mask.cumsum(2, dtype=x.dtype) 35 | if self.normalize: 36 | eps = 1e-6 37 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 38 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 39 | 40 | dim_t = torch.arange(self.num_pos_feats, dtype=x.dtype, device=x.device) 41 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 42 | 43 | pos_x = x_embed[:, :, :, None] / dim_t 44 | pos_y = y_embed[:, :, :, None] / dim_t 45 | pos_x = torch.stack( 46 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 47 | ).flatten(3) 48 | pos_y = torch.stack( 49 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 50 | ).flatten(3) 51 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 52 | return pos 53 | 54 | def __repr__(self, _repr_indent=4): 55 | head = "Positional encoding " + self.__class__.__name__ 56 | body = [ 57 | "num_pos_feats: {}".format(self.num_pos_feats), 58 | "temperature: {}".format(self.temperature), 59 | "normalize: {}".format(self.normalize), 60 | "scale: {}".format(self.scale), 61 | ] 62 | # _repr_indent = 4 63 | lines = [head] + [" " * _repr_indent + line for line in body] 64 | return "\n".join(lines) 65 | -------------------------------------------------------------------------------- /dinov/modules/postprocessing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import torch 3 | from torch.nn import functional as F 4 | 5 | from detectron2.structures import Instances, ROIMasks 6 | 7 | 8 | def sem_seg_postprocess(result, img_size, output_height, output_width): 9 | """ 10 | Return semantic segmentation predictions in the original resolution. 11 | 12 | The input images are often resized when entering semantic segmentor. Moreover, in same 13 | cases, they also padded inside segmentor to be divisible by maximum network stride. 14 | As a result, we often need the predictions of the segmentor in a different 15 | resolution from its inputs. 16 | 17 | Args: 18 | result (Tensor): semantic segmentation prediction logits. A tensor of shape (C, H, W), 19 | where C is the number of classes, and H, W are the height and width of the prediction. 20 | img_size (tuple): image size that segmentor is taking as input. 21 | output_height, output_width: the desired output resolution. 22 | 23 | Returns: 24 | semantic segmentation prediction (Tensor): A tensor of the shape 25 | (C, output_height, output_width) that contains per-pixel soft predictions. 
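    The logits are first cropped to img_size (dropping any padding added for stride
    divisibility) and then resized to (output_height, output_width) with bicubic,
    antialiased interpolation.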
26 | """ 27 | if len(result.shape)>3: 28 | result = result[:, :, : img_size[0], : img_size[1]].expand(1, -1, -1, -1) 29 | else: 30 | result = result[:, : img_size[0], : img_size[1]].expand(1, -1, -1, -1) 31 | result = F.interpolate( 32 | result, size=(output_height, output_width), mode="bicubic", align_corners=False, antialias=True 33 | )[0] 34 | return result 35 | -------------------------------------------------------------------------------- /dinov/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import * 2 | from .misc import * 3 | from .box_ops import * -------------------------------------------------------------------------------- /dinov/utils/box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Utilities for bounding box manipulation and GIoU. 4 | """ 5 | import torch 6 | from torchvision.ops.boxes import box_area 7 | import numpy as np 8 | 9 | 10 | def build_point_grid(n_per_side: int) -> np.ndarray: 11 | """Generates a 2D grid of points evenly spaced in [0,1]x[0,1].""" 12 | offset = 1 / (2 * n_per_side) 13 | points_one_side = np.linspace(offset, 1 - offset, n_per_side) 14 | points_x = np.tile(points_one_side[None, :], (n_per_side, 1)) 15 | points_y = np.tile(points_one_side[:, None], (1, n_per_side)) 16 | points = np.stack([points_x, points_y], axis=-1).reshape(-1, 2) 17 | return points 18 | 19 | def box_cxcywh_to_xyxy(x): 20 | x_c, y_c, w, h = x.unbind(-1) 21 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 22 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 23 | return torch.stack(b, dim=-1) 24 | 25 | 26 | def box_xyxy_to_cxcywh(x): 27 | x0, y0, x1, y1 = x.unbind(-1) 28 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 29 | (x1 - x0), (y1 - y0)] 30 | return torch.stack(b, dim=-1) 31 | 32 | def box_xywh_to_xyxy(x): 33 | x0, y0, x1, y1 = x.unbind(-1) 34 | b = [x0, y0, (x0 + x1), (y0 + y1)] 35 | return torch.stack(b, dim=-1) 36 | 37 | 38 | # modified from torchvision to also return the union 39 | def box_iou(boxes1, boxes2): 40 | area1 = box_area(boxes1) 41 | area2 = box_area(boxes2) 42 | 43 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 44 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 45 | 46 | wh = (rb - lt).clamp(min=0) # [N,M,2] 47 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 48 | 49 | union = area1[:, None] + area2 - inter 50 | 51 | iou = inter / (union+1e-6) 52 | return iou, union 53 | 54 | 55 | def generalized_box_iou(boxes1, boxes2): 56 | """ 57 | Generalized IoU from https://giou.stanford.edu/ 58 | 59 | The boxes should be in [x0, y0, x1, y1] format 60 | 61 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 62 | and M = len(boxes2) 63 | """ 64 | # degenerate boxes gives inf / nan results 65 | # so do an early check 66 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 67 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 68 | iou, union = box_iou(boxes1, boxes2) 69 | 70 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 71 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 72 | 73 | wh = (rb - lt).clamp(min=0) # [N,M,2] 74 | area = wh[:, :, 0] * wh[:, :, 1] 75 | 76 | return iou - (area - union) / (area+1e-6) 77 | 78 | def generalized_box_iou_padded(boxes1, boxes2): 79 | """ 80 | Generalized IoU from https://giou.stanford.edu/ 81 | 82 | The boxes should be in [x0, y0, x1, y1] format 83 | 84 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 85 | and M = len(boxes2) 86 | 
""" 87 | # degenerate boxes gives inf / nan results 88 | # so do an early check 89 | # assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 90 | # assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 91 | iou, union = box_iou(boxes1, boxes2) 92 | 93 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 94 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 95 | 96 | wh = (rb - lt).clamp(min=0) # [N,M,2] 97 | area = wh[:, :, 0] * wh[:, :, 1] 98 | 99 | return iou - (area - union) / (area+1e-6) 100 | 101 | 102 | def masks_to_boxes(masks): 103 | """Compute the bounding boxes around the provided masks 104 | 105 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 106 | 107 | Returns a [N, 4] tensors, with the boxes in xyxy format 108 | """ 109 | if masks.numel() == 0: 110 | return torch.zeros((0, 4), device=masks.device) 111 | 112 | h, w = masks.shape[-2:] 113 | 114 | y = torch.arange(0, h, dtype=torch.float) 115 | x = torch.arange(0, w, dtype=torch.float) 116 | y, x = torch.meshgrid(y, x) 117 | 118 | x_mask = (masks * x.unsqueeze(0)) 119 | x_max = x_mask.flatten(1).max(-1)[0] 120 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 121 | 122 | y_mask = (masks * y.unsqueeze(0)) 123 | y_max = y_mask.flatten(1).max(-1)[0] 124 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 125 | 126 | return torch.stack([x_min, y_min, x_max, y_max], 1) -------------------------------------------------------------------------------- /dinov/utils/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | import functools 5 | import inspect 6 | 7 | def configurable(init_func=None, *, from_config=None): 8 | """ 9 | Decorate a function or a class's __init__ method so that it can be called 10 | with a :class:`CfgNode` object using a :func:`from_config` function that translates 11 | :class:`CfgNode` to arguments. 12 | 13 | Examples: 14 | :: 15 | # Usage 1: Decorator on __init__: 16 | class A: 17 | @configurable 18 | def __init__(self, a, b=2, c=3): 19 | pass 20 | 21 | @classmethod 22 | def from_config(cls, cfg): # 'cfg' must be the first argument 23 | # Returns kwargs to be passed to __init__ 24 | return {"a": cfg.A, "b": cfg.B} 25 | 26 | a1 = A(a=1, b=2) # regular construction 27 | a2 = A(cfg) # construct with a cfg 28 | a3 = A(cfg, b=3, c=4) # construct with extra overwrite 29 | 30 | # Usage 2: Decorator on any function. Needs an extra from_config argument: 31 | @configurable(from_config=lambda cfg: {"a: cfg.A, "b": cfg.B}) 32 | def a_func(a, b=2, c=3): 33 | pass 34 | 35 | a1 = a_func(a=1, b=2) # regular call 36 | a2 = a_func(cfg) # call with a cfg 37 | a3 = a_func(cfg, b=3, c=4) # call with extra overwrite 38 | 39 | Args: 40 | init_func (callable): a class's ``__init__`` method in usage 1. The 41 | class must have a ``from_config`` classmethod which takes `cfg` as 42 | the first argument. 43 | from_config (callable): the from_config function in usage 2. It must take `cfg` 44 | as its first argument. 45 | """ 46 | 47 | if init_func is not None: 48 | assert ( 49 | inspect.isfunction(init_func) 50 | and from_config is None 51 | and init_func.__name__ == "__init__" 52 | ), "Incorrect use of @configurable. Check API documentation for examples." 
53 | 54 | @functools.wraps(init_func) 55 | def wrapped(self, *args, **kwargs): 56 | try: 57 | from_config_func = type(self).from_config 58 | except AttributeError as e: 59 | raise AttributeError( 60 | "Class with @configurable must have a 'from_config' classmethod." 61 | ) from e 62 | if not inspect.ismethod(from_config_func): 63 | raise TypeError("Class with @configurable must have a 'from_config' classmethod.") 64 | 65 | # import ipdb; ipdb.set_trace() 66 | if _called_with_cfg(*args, **kwargs): 67 | explicit_args = _get_args_from_config(from_config_func, *args, **kwargs) 68 | init_func(self, **explicit_args) 69 | else: 70 | init_func(self, *args, **kwargs) 71 | 72 | return wrapped 73 | 74 | else: 75 | if from_config is None: 76 | return configurable # @configurable() is made equivalent to @configurable 77 | assert inspect.isfunction( 78 | from_config 79 | ), "from_config argument of configurable must be a function!" 80 | 81 | def wrapper(orig_func): 82 | @functools.wraps(orig_func) 83 | def wrapped(*args, **kwargs): 84 | if _called_with_cfg(*args, **kwargs): 85 | explicit_args = _get_args_from_config(from_config, *args, **kwargs) 86 | return orig_func(**explicit_args) 87 | else: 88 | return orig_func(*args, **kwargs) 89 | 90 | wrapped.from_config = from_config 91 | return wrapped 92 | 93 | return wrapper 94 | 95 | def _called_with_cfg(*args, **kwargs): 96 | """ 97 | Returns: 98 | bool: whether the arguments contain CfgNode and should be considered 99 | forwarded to from_config. 100 | """ 101 | from omegaconf import DictConfig, OmegaConf, ListConfig 102 | # from detectron2.config import LazyConfig 103 | 104 | if len(args) and (isinstance(args[0], (dict)) or (isinstance(args[0], (DictConfig)))): 105 | return True 106 | if isinstance(kwargs.pop("cfg", None), (dict)): 107 | return True 108 | # `from_config`'s first argument is forced to be "cfg". 109 | # So the above check covers all cases. 110 | return False 111 | 112 | def _get_args_from_config(from_config_func, *args, **kwargs): 113 | """ 114 | Use `from_config` to obtain explicit arguments. 
115 | 116 | Returns: 117 | dict: arguments to be used for cls.__init__ 118 | """ 119 | signature = inspect.signature(from_config_func) 120 | if list(signature.parameters.keys())[0] != "cfg": 121 | if inspect.isfunction(from_config_func): 122 | name = from_config_func.__name__ 123 | else: 124 | name = f"{from_config_func.__self__}.from_config" 125 | raise TypeError(f"{name} must take 'cfg' as the first argument!") 126 | support_var_arg = any( 127 | param.kind in [param.VAR_POSITIONAL, param.VAR_KEYWORD] 128 | for param in signature.parameters.values() 129 | ) 130 | if support_var_arg: # forward all arguments to from_config, if from_config accepts them 131 | ret = from_config_func(*args, **kwargs) 132 | else: 133 | # forward supported arguments to from_config 134 | supported_arg_names = set(signature.parameters.keys()) 135 | extra_kwargs = {} 136 | for name in list(kwargs.keys()): 137 | if name not in supported_arg_names: 138 | extra_kwargs[name] = kwargs.pop(name) 139 | ret = from_config_func(*args, **kwargs) 140 | # forward the other arguments to __init__ 141 | ret.update(extra_kwargs) 142 | return ret -------------------------------------------------------------------------------- /repo.diff: -------------------------------------------------------------------------------- 1 | diff --git openseed/architectures/joint_oi_model.py openseed/architectures/joint_oi_model.py 2 | index 8086690..a0679fe 100644 3 | --- openseed/architectures/joint_oi_model.py 4 | +++ openseed/architectures/joint_oi_model.py 5 | @@ -286,6 +286,7 @@ class GeneralizedMaskDINO(nn.Module): 6 | "coco_on": dec_cfg.get('COCO', True), 7 | "coco_mask_on": dec_cfg.get('COCO_MASK', True), 8 | "o365_on": dec_cfg.get('O365', True), 9 | + "regenerate_point": dec_cfg.get('RE_POINT', False), 10 | } 11 | 12 | @property 13 | @@ -531,7 +532,7 @@ class GeneralizedMaskDINO(nn.Module): 14 | 15 | # if not self.training: 16 | # box_start = int(num_mask/4*3) 17 | - box_start = random.randint(0, self.max_num_instance - 1) # box based interactive after this number; about 1/4 18 | + box_start = random.randint(1, self.max_num_instance - 1) # box based interactive after this number; about 1/4 19 | point_coords = targets_per_image.point_coords[index[:box_start]] 20 | # FIXME randomly sample one point as the user input 21 | if self.regenerate_point: -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision 3 | opencv-python 4 | pyyaml 5 | json_tricks 6 | yacs 7 | scikit-learn 8 | pandas 9 | timm==0.4.12 10 | numpy==1.23.5 11 | einops 12 | fvcore 13 | transformers==4.19.2 14 | sentencepiece 15 | ftfy 16 | regex 17 | nltk 18 | vision-datasets==0.2.2 19 | pycocotools==2.0.4 20 | diffdist 21 | pyarrow 22 | cityscapesscripts 23 | shapely 24 | scikit-image 25 | mup 26 | gradio==3.40.0 27 | scann 28 | kornia==0.6.4 29 | torchmetrics==0.6.0 30 | progressbar 31 | pillow==9.4.0 32 | -------------------------------------------------------------------------------- /utils/Config.py: -------------------------------------------------------------------------------- 1 | from fvcore.common.config import CfgNode as _CfgNode 2 | 3 | class CfgNode(_CfgNode): 4 | """ 5 | The same as `fvcore.common.config.CfgNode`, but different in: 6 | 7 | 1. Use unsafe yaml loading by default. 
8 | Note that this may lead to arbitrary code execution: you must not 9 | load a config file from untrusted sources before manually inspecting 10 | the content of the file. 11 | 2. Support config versioning. 12 | When attempting to merge an old config, it will convert the old config automatically. 13 | 14 | .. automethod:: clone 15 | .. automethod:: freeze 16 | .. automethod:: defrost 17 | .. automethod:: is_frozen 18 | .. automethod:: load_yaml_with_base 19 | .. automethod:: merge_from_list 20 | .. automethod:: merge_from_other_cfg 21 | """ 22 | 23 | def merge_from_dict(self, dict): 24 | pass 25 | 26 | node = CfgNode() -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .dist import * -------------------------------------------------------------------------------- /utils/arguments.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import json 3 | import argparse 4 | import logging 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def load_config_dict_to_opt(opt, config_dict): 10 | """ 11 | Load the key, value pairs from config_dict to opt, overriding existing values in opt 12 | if there are any. 13 | """ 14 | if not isinstance(config_dict, dict): 15 | raise TypeError("Config must be a Python dictionary") 16 | for k, v in config_dict.items(): 17 | k_parts = k.split('.') 18 | pointer = opt 19 | for k_part in k_parts[:-1]: 20 | if k_part not in pointer: 21 | pointer[k_part] = {} 22 | pointer = pointer[k_part] 23 | assert isinstance(pointer, dict), "Overriding key needs to be inside a Python dict." 24 | ori_value = pointer.get(k_parts[-1]) 25 | pointer[k_parts[-1]] = v 26 | if ori_value: 27 | logger.warning(f"Overrode {k} from {ori_value} to {pointer[k_parts[-1]]}") 28 | 29 | def load_opt_from_config_file(conf_file): 30 | """ 31 | Load opt from a single config file. 32 | 33 | Args: 34 | conf_file: config file path 35 | 36 | Returns: 37 | dict: a dictionary of opt settings 38 | """ 39 | opt = {} 40 | with open(conf_file, encoding='utf-8') as f: 41 | config_dict = yaml.safe_load(f) 42 | 43 | load_config_dict_to_opt(opt, config_dict) 44 | 45 | return opt 46 | 47 | 48 | def load_opt_from_config_files(conf_files): 49 | """ 50 | Load opt from the config files; settings in later files override those in earlier files. 51 | 52 | Args: 53 | conf_files (list): a list of config file paths 54 | 55 | Returns: 56 | dict: a dictionary of opt settings 57 | """ 58 | opt = {} 59 | for conf_file in conf_files: 60 | with open(conf_file, encoding='utf-8') as f: 61 | config_dict = yaml.safe_load(f) 62 | 63 | load_config_dict_to_opt(opt, config_dict) 64 | 65 | return opt 66 | 67 | 68 | def load_opt_command(args): 69 | parser = argparse.ArgumentParser(description='Pretrain or fine-tune models for NLP tasks.') 70 | parser.add_argument('command', help='Command: train/evaluate/train-and-evaluate') 71 | parser.add_argument('--conf_files', nargs='+', required=True, help='Path(s) to the config file(s).') 72 | parser.add_argument('--user_dir', help='Path to the user defined module for tasks (models, criteria), optimizers, and lr schedulers.') 73 | parser.add_argument('--config_overrides', nargs='*', help='Override parameters on config with a json style string, e.g. {"<PARAM_NAME_1>": <PARAM_VALUE_1>, "<PARAM_GROUP_2>.<PARAM_NAME_2>": <PARAM_VALUE_2>}. A key with "." updates the object in the corresponding nested dict. 
Remember to escape " in command line.') 74 | parser.add_argument('--overrides', help='arguments that used to override the config file in cmdline', nargs=argparse.REMAINDER) 75 | 76 | cmdline_args = parser.parse_args() if not args else parser.parse_args(args) 77 | 78 | opt = load_opt_from_config_files(cmdline_args.conf_files) 79 | 80 | if cmdline_args.config_overrides: 81 | config_overrides_string = ' '.join(cmdline_args.config_overrides) 82 | logger.warning(f"Command line config overrides: {config_overrides_string}") 83 | config_dict = json.loads(config_overrides_string) 84 | load_config_dict_to_opt(opt, config_dict) 85 | 86 | if cmdline_args.overrides: 87 | assert len(cmdline_args.overrides) % 2 == 0, "overrides arguments is not paired, required: key value" 88 | keys = [cmdline_args.overrides[idx*2] for idx in range(len(cmdline_args.overrides)//2)] 89 | vals = [cmdline_args.overrides[idx*2+1] for idx in range(len(cmdline_args.overrides)//2)] 90 | vals = [val.replace('false', '').replace('False','') if len(val.replace(' ', '')) == 5 else val for val in vals] 91 | 92 | types = [] 93 | for key in keys: 94 | key = key.split('.') 95 | ele = opt.copy() 96 | while len(key) > 0: 97 | ele = ele[key.pop(0)] 98 | types.append(type(ele)) 99 | 100 | config_dict = {x:z(y) for x,y,z in zip(keys, vals, types)} 101 | load_config_dict_to_opt(opt, config_dict) 102 | 103 | # combine cmdline_args into opt dictionary 104 | for key, val in cmdline_args.__dict__.items(): 105 | if val is not None: 106 | opt[key] = val 107 | 108 | return opt, cmdline_args 109 | 110 | 111 | def save_opt_to_json(opt, conf_file): 112 | with open(conf_file, 'w', encoding='utf-8') as f: 113 | json.dump(opt, f, indent=4) 114 | 115 | 116 | def save_opt_to_yaml(opt, conf_file): 117 | with open(conf_file, 'w', encoding='utf-8') as f: 118 | yaml.dump(opt, f) 119 | -------------------------------------------------------------------------------- /utils/dist.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json, time 3 | import torch 4 | import torch.distributed as dist 5 | 6 | 7 | def get_world_size(): 8 | if torch.distributed.is_initialized(): 9 | return torch.distributed.get_world_size() 10 | return 1 11 | 12 | def all_gather(x): 13 | if get_world_size() > 1: 14 | all_x = [torch.zeros_like(x) for _ in range(get_world_size())] 15 | torch.distributed.all_gather(all_x, x.detach()) 16 | all_x[torch.distributed.get_rank()] = x 17 | x = torch.stack(all_x, dim=0) 18 | return x 19 | 20 | def init_distributed_mode(args): 21 | if 'WORLD_SIZE' in os.environ and os.environ['WORLD_SIZE'] != '': # 'RANK' in os.environ and 22 | args.rank = int(os.environ["RANK"]) 23 | args.world_size = int(os.environ['WORLD_SIZE']) 24 | args.gpu = args.local_rank = int(os.environ['LOCAL_RANK']) 25 | 26 | print('world size: {}, rank: {}, local rank: {}'.format(args.world_size, args.rank, args.local_rank)) 27 | print(json.dumps(dict(os.environ), indent=2)) 28 | elif 'SLURM_PROCID' in os.environ: 29 | args.rank = int(os.environ['SLURM_PROCID']) 30 | args.gpu = args.local_rank = int(os.environ['SLURM_LOCALID']) 31 | args.world_size = int(os.environ['SLURM_NPROCS']) 32 | 33 | if os.environ.get('HAND_DEFINE_DIST_URL', 0) == '1': 34 | pass 35 | else: 36 | import util.hostlist as uh 37 | nodenames = uh.parse_nodelist(os.environ['SLURM_JOB_NODELIST']) 38 | gpu_ids = [int(node[3:]) for node in nodenames] 39 | fixid = int(os.environ.get('FIX_DISTRIBUTED_PORT_NUMBER', 0)) 40 | # fixid += random.randint(0, 300) 41 | port = 
str(3137 + int(min(gpu_ids)) + fixid) 42 | args.dist_url = "tcp://{ip}:{port}".format(ip=uh.nodename_to_ip(nodenames[0]), port=port) 43 | 44 | print('world size: {}, world rank: {}, local rank: {}, device_count: {}'.format(args.world_size, args.rank, args.local_rank, torch.cuda.device_count())) 45 | 46 | 47 | else: 48 | print('Not using distributed mode') 49 | args.distributed = False 50 | args.world_size = 1 51 | args.rank = 0 52 | args.local_rank = 0 53 | return 54 | 55 | print("world_size:{} rank:{} local_rank:{}".format(args.world_size, args.rank, args.local_rank)) 56 | args.distributed = True 57 | torch.cuda.set_device(args.local_rank) 58 | args.dist_backend = 'nccl' 59 | print('| distributed init (rank {}): {}'.format(args.rank, args.dist_url), flush=True) 60 | 61 | torch.distributed.init_process_group( 62 | backend=args.dist_backend, 63 | world_size=args.world_size, 64 | rank=args.rank, 65 | init_method=args.dist_url, 66 | ) 67 | 68 | print("Before torch.distributed.barrier()") 69 | torch.distributed.barrier() 70 | print("End torch.distributed.barrier()") -------------------------------------------------------------------------------- /utils/misc.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # X-Decoder -- Generalized Decoding for Pixel, Image, and Language 3 | # Copyright (c) 2022 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Xueyan Zou (xueyan@cs.wisc.edu) 6 | # -------------------------------------------------------- 7 | import math 8 | import wandb 9 | import os 10 | 11 | 12 | # HACK for evalution 13 | def hook_metadata(metadata, name): 14 | if name == 'cityscapes_fine_sem_seg_val': 15 | metadata.__setattr__("keep_sem_bgd", False) 16 | return metadata 17 | 18 | def hook_opt(model, name): 19 | if name in ['cityscapes_fine_panoptic_val', 'ade20k_panoptic_val', 'bdd10k_40_panoptic_val', 'cityscapes_fine_panoptic_val', 'scannet_21_panoptic_val']: 20 | model.model.object_mask_threshold = 0.4 21 | else: 22 | model.model.object_mask_threshold = 0.8 23 | 24 | # HACK for evalution 25 | def hook_switcher(model, name): 26 | mappings = {} 27 | if name in ['cityscapes_fine_sem_seg_val', 'scannet_21_val_seg', 'scannet_38_val_seg', 'scannet_41_val_seg', 'sunrgbd_37_val_seg', 'bdd10k_val_sem_seg', 'ade20k_full_sem_seg_val']: 28 | mappings = {'SEMANTIC_ON': True, 'INSTANCE_ON': False, 'PANOPTIC_ON': False} 29 | elif name in ['cityscapes_fine_instance_seg_val', 'pascal_part_val_interactive', 'pascal_part_val', 'pascal_part_train'] or 'seginw' in name or 'lvis' in name or 'odinw' in name: 30 | mappings = {'SEMANTIC_ON': False, 'INSTANCE_ON': True, 'PANOPTIC_ON': False} 31 | elif name in ['cityscapes_fine_panoptic_val', 'scannet_21_panoptic_val', 'bdd10k_40_panoptic_val']: 32 | # mappings = {'SEMANTIC_ON': True, 'INSTANCE_ON': False, 'PANOPTIC_ON': True} 33 | mappings = {'SEMANTIC_ON': True, 'INSTANCE_ON': True, 'PANOPTIC_ON': True} 34 | elif 'coco_2017_val_panoptic_with_sem_seg' in name or name in ['ade20k_panoptic_val', 'ade20k_panoptic_train', 'coco_2017_test-dev', 'sam_val', 'sam_minival']: 35 | mappings = {'SEMANTIC_ON': True, 'INSTANCE_ON': True, 'PANOPTIC_ON': True} 36 | elif name in ['cityscapes_fine_sem_seg_val', 'scannet_21_val_seg', 'scannet_38_val_seg', 'scannet_41_val_seg', 37 | 'sunrgbd_37_val_seg', 'context_59_val_seg', 'context_459_val_seg', 'voc_2012_val_seg', 38 | 'bdd10k_val_sem_seg']: 39 | mappings = {'SEMANTIC_ON': True, 
'INSTANCE_ON': False, 'PANOPTIC_ON': False} 40 | elif name in ['cityscapes_fine_instance_seg_val'] or 'seginw' in name: 41 | mappings = {'SEMANTIC_ON': False, 'INSTANCE_ON': True, 'PANOPTIC_ON': False} 42 | mappings = {'SEMANTIC_ON': True, 'INSTANCE_ON': True, 'PANOPTIC_ON': True} 43 | elif name in ['coco_2017_val_panoptic_with_sem_seg', 'ade20k_panoptic_val']: 44 | mappings = {'SEMANTIC_ON': True, 'INSTANCE_ON': True, 'PANOPTIC_ON': True} 45 | else: 46 | if name not in ["vlp_val", "vlp_captioning_val", "vlp_val2017", "vlp_captioning_val2017", "imagenet_val", "refcocog_val_google", "phrasecut_val", "phrasecut_test", "refcocop_val_unc", "refcoco_val_unc", "refcocog_val_umd"]: 47 | assert False, "dataset switcher is not defined" 48 | for key, value in mappings.items(): 49 | if key == 'SEMANTIC_ON': 50 | model.model.semantic_on = value 51 | if key == 'INSTANCE_ON': 52 | model.model.instance_on = value 53 | if key == 'PANOPTIC_ON': 54 | model.model.panoptic_on = value 55 | 56 | class AverageMeter(object): 57 | """Computes and stores the average and current value.""" 58 | def __init__(self): 59 | self.reset() 60 | 61 | def reset(self): 62 | self.val = 0 63 | self.avg = 0 64 | self.sum = 0 65 | self.count = 0 66 | 67 | def update(self, val, n=1, decay=0): 68 | self.val = val 69 | if decay: 70 | alpha = math.exp(-n / decay) # exponential decay over 100 updates 71 | self.sum = alpha * self.sum + (1 - alpha) * val * n 72 | self.count = alpha * self.count + (1 - alpha) * n 73 | else: 74 | self.sum += val * n 75 | self.count += n 76 | self.avg = self.sum / self.count 77 | 78 | def init_wandb(args, job_dir, entity='646396839lifeng', project='xdecoder', job_name='tmp'): 79 | wandb_dir = os.path.join(job_dir, 'wandb') 80 | os.makedirs(wandb_dir, exist_ok=True) 81 | runid = None 82 | if os.path.exists(f"{wandb_dir}/runid.txt"): 83 | runid = open(f"{wandb_dir}/runid.txt").read() 84 | 85 | wandb.init(project=project, 86 | name=job_name, 87 | dir=wandb_dir, 88 | entity=entity, 89 | resume="allow", 90 | id=runid, 91 | config={"hierarchical": True}, ) 92 | 93 | open(f"{wandb_dir}/runid.txt", 'w').write(wandb.run.id) 94 | wandb.config.update({k: args[k] for k in args if k not in wandb.config}) -------------------------------------------------------------------------------- /utils/model.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | import pickle 5 | import torch 6 | from detectron2.utils.comm import is_main_process 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | NORM_MODULES = [ 12 | torch.nn.BatchNorm1d, 13 | torch.nn.BatchNorm2d, 14 | torch.nn.BatchNorm3d, 15 | torch.nn.SyncBatchNorm, 16 | # NaiveSyncBatchNorm inherits from BatchNorm2d 17 | torch.nn.GroupNorm, 18 | torch.nn.InstanceNorm1d, 19 | torch.nn.InstanceNorm2d, 20 | torch.nn.InstanceNorm3d, 21 | torch.nn.LayerNorm, 22 | torch.nn.LocalResponseNorm, 23 | ] 24 | 25 | def register_norm_module(cls): 26 | NORM_MODULES.append(cls) 27 | return cls 28 | 29 | def align_and_update_state_dicts(model_state_dict, ckpt_state_dict): 30 | model_keys = sorted(model_state_dict.keys()) 31 | ckpt_keys = sorted(ckpt_state_dict.keys()) 32 | result_dicts = {} 33 | matched_log = [] 34 | unmatched_log = [] 35 | unloaded_log = [] 36 | for model_key in model_keys: 37 | model_weight = model_state_dict[model_key] 38 | if model_key in ckpt_keys: 39 | ckpt_weight = ckpt_state_dict[model_key] 40 | if model_weight.shape == ckpt_weight.shape: 41 | result_dicts[model_key] = ckpt_weight 
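# Keys matched above are removed from ckpt_keys below, so anything left after the loop is reported as $UNUSED$ checkpoint weights.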
42 | ckpt_keys.pop(ckpt_keys.index(model_key)) 43 | matched_log.append("Loaded {}, Model Shape: {} <-> Ckpt Shape: {}".format(model_key, model_weight.shape, ckpt_weight.shape)) 44 | else: 45 | unmatched_log.append("*UNMATCHED* {}, Model Shape: {} <-> Ckpt Shape: {}".format(model_key, model_weight.shape, ckpt_weight.shape)) 46 | else: 47 | unloaded_log.append("*UNLOADED* {}, Model Shape: {}".format(model_key, model_weight.shape)) 48 | 49 | if is_main_process(): 50 | for info in matched_log: 51 | logger.info(info) 52 | for info in unloaded_log: 53 | logger.warning(info) 54 | for key in ckpt_keys: 55 | logger.warning("$UNUSED$ {}, Ckpt Shape: {}".format(key, ckpt_state_dict[key].shape)) 56 | for info in unmatched_log: 57 | logger.warning(info) 58 | return result_dicts -------------------------------------------------------------------------------- /utils/sam_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /utils/sam_utils/onnx.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.nn import functional as F 10 | 11 | from typing import Tuple 12 | 13 | from ..modeling import Sam 14 | from .amg import calculate_stability_score 15 | 16 | 17 | class SamOnnxModel(nn.Module): 18 | """ 19 | This model should not be called directly, but is used in ONNX export. 20 | It combines the prompt encoder, mask decoder, and mask postprocessing of Sam, 21 | with some functions modified to enable model tracing. Also supports extra 22 | options controlling what information is returned. See the ONNX export script for details. 
23 | """ 24 | 25 | def __init__( 26 | self, 27 | model: Sam, 28 | return_single_mask: bool, 29 | use_stability_score: bool = False, 30 | return_extra_metrics: bool = False, 31 | ) -> None: 32 | super().__init__() 33 | self.mask_decoder = model.mask_decoder 34 | self.model = model 35 | self.img_size = model.image_encoder.img_size 36 | self.return_single_mask = return_single_mask 37 | self.use_stability_score = use_stability_score 38 | self.stability_score_offset = 1.0 39 | self.return_extra_metrics = return_extra_metrics 40 | 41 | @staticmethod 42 | def resize_longest_image_size( 43 | input_image_size: torch.Tensor, longest_side: int 44 | ) -> torch.Tensor: 45 | input_image_size = input_image_size.to(torch.float32) 46 | scale = longest_side / torch.max(input_image_size) 47 | transformed_size = scale * input_image_size 48 | transformed_size = torch.floor(transformed_size + 0.5).to(torch.int64) 49 | return transformed_size 50 | 51 | def _embed_points(self, point_coords: torch.Tensor, point_labels: torch.Tensor) -> torch.Tensor: 52 | point_coords = point_coords + 0.5 53 | point_coords = point_coords / self.img_size 54 | point_embedding = self.model.prompt_encoder.pe_layer._pe_encoding(point_coords) 55 | point_labels = point_labels.unsqueeze(-1).expand_as(point_embedding) 56 | 57 | point_embedding = point_embedding * (point_labels != -1) 58 | point_embedding = point_embedding + self.model.prompt_encoder.not_a_point_embed.weight * ( 59 | point_labels == -1 60 | ) 61 | 62 | for i in range(self.model.prompt_encoder.num_point_embeddings): 63 | point_embedding = point_embedding + self.model.prompt_encoder.point_embeddings[ 64 | i 65 | ].weight * (point_labels == i) 66 | 67 | return point_embedding 68 | 69 | def _embed_masks(self, input_mask: torch.Tensor, has_mask_input: torch.Tensor) -> torch.Tensor: 70 | mask_embedding = has_mask_input * self.model.prompt_encoder.mask_downscaling(input_mask) 71 | mask_embedding = mask_embedding + ( 72 | 1 - has_mask_input 73 | ) * self.model.prompt_encoder.no_mask_embed.weight.reshape(1, -1, 1, 1) 74 | return mask_embedding 75 | 76 | def mask_postprocessing(self, masks: torch.Tensor, orig_im_size: torch.Tensor) -> torch.Tensor: 77 | masks = F.interpolate( 78 | masks, 79 | size=(self.img_size, self.img_size), 80 | mode="bilinear", 81 | align_corners=False, 82 | ) 83 | 84 | prepadded_size = self.resize_longest_image_size(orig_im_size, self.img_size) 85 | masks = masks[..., : int(prepadded_size[0]), : int(prepadded_size[1])] 86 | 87 | orig_im_size = orig_im_size.to(torch.int64) 88 | h, w = orig_im_size[0], orig_im_size[1] 89 | masks = F.interpolate(masks, size=(h, w), mode="bilinear", align_corners=False) 90 | return masks 91 | 92 | def select_masks( 93 | self, masks: torch.Tensor, iou_preds: torch.Tensor, num_points: int 94 | ) -> Tuple[torch.Tensor, torch.Tensor]: 95 | # Determine if we should return the multiclick mask or not from the number of points. 96 | # The reweighting is used to avoid control flow. 
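# score_reweight is [1000, 0, ..., 0]: with more than two prompt points (num_points - 2.5 > 0) the single-output mask token at index 0 gets a large bonus and is always selected, while a single click penalizes it so the best-scoring multimask token wins instead.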
97 | score_reweight = torch.tensor( 98 | [[1000] + [0] * (self.model.mask_decoder.num_mask_tokens - 1)] 99 | ).to(iou_preds.device) 100 | score = iou_preds + (num_points - 2.5) * score_reweight 101 | best_idx = torch.argmax(score, dim=1) 102 | masks = masks[torch.arange(masks.shape[0]), best_idx, :, :].unsqueeze(1) 103 | iou_preds = iou_preds[torch.arange(masks.shape[0]), best_idx].unsqueeze(1) 104 | 105 | return masks, iou_preds 106 | 107 | @torch.no_grad() 108 | def forward( 109 | self, 110 | image_embeddings: torch.Tensor, 111 | point_coords: torch.Tensor, 112 | point_labels: torch.Tensor, 113 | mask_input: torch.Tensor, 114 | has_mask_input: torch.Tensor, 115 | orig_im_size: torch.Tensor, 116 | ): 117 | sparse_embedding = self._embed_points(point_coords, point_labels) 118 | dense_embedding = self._embed_masks(mask_input, has_mask_input) 119 | 120 | masks, scores = self.model.mask_decoder.predict_masks( 121 | image_embeddings=image_embeddings, 122 | image_pe=self.model.prompt_encoder.get_dense_pe(), 123 | sparse_prompt_embeddings=sparse_embedding, 124 | dense_prompt_embeddings=dense_embedding, 125 | ) 126 | 127 | if self.use_stability_score: 128 | scores = calculate_stability_score( 129 | masks, self.model.mask_threshold, self.stability_score_offset 130 | ) 131 | 132 | if self.return_single_mask: 133 | masks, scores = self.select_masks(masks, scores, point_coords.shape[1]) 134 | 135 | upscaled_masks = self.mask_postprocessing(masks, orig_im_size) 136 | 137 | if self.return_extra_metrics: 138 | stability_scores = calculate_stability_score( 139 | upscaled_masks, self.model.mask_threshold, self.stability_score_offset 140 | ) 141 | areas = (upscaled_masks > self.model.mask_threshold).sum(-1).sum(-1) 142 | return upscaled_masks, scores, stability_scores, areas, masks 143 | 144 | return upscaled_masks, scores, masks 145 | -------------------------------------------------------------------------------- /utils/sam_utils/transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import numpy as np 8 | import torch 9 | from torch.nn import functional as F 10 | from torchvision.transforms.functional import resize, to_pil_image # type: ignore 11 | 12 | from copy import deepcopy 13 | from typing import Tuple 14 | 15 | 16 | class ResizeLongestSide: 17 | """ 18 | Resizes images to longest side 'target_length', as well as provides 19 | methods for resizing coordinates and boxes. Provides methods for 20 | transforming both numpy array and batched torch tensors. 21 | """ 22 | 23 | def __init__(self, target_length: int) -> None: 24 | self.target_length = target_length 25 | 26 | def apply_image(self, image: np.ndarray) -> np.ndarray: 27 | """ 28 | Expects a numpy array with shape HxWxC in uint8 format. 29 | """ 30 | target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length) 31 | return np.array(resize(to_pil_image(image), target_size)) 32 | 33 | def apply_coords(self, coords: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: 34 | """ 35 | Expects a numpy array of length 2 in the final dimension. Requires the 36 | original image size in (H, W) format. 
37 | """ 38 | old_h, old_w = original_size 39 | new_h, new_w = self.get_preprocess_shape( 40 | original_size[0], original_size[1], self.target_length 41 | ) 42 | coords = deepcopy(coords).astype(float) 43 | coords[..., 0] = coords[..., 0] * (new_w / old_w) 44 | coords[..., 1] = coords[..., 1] * (new_h / old_h) 45 | return coords 46 | 47 | def apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: 48 | """ 49 | Expects a numpy array shape Bx4. Requires the original image size 50 | in (H, W) format. 51 | """ 52 | boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size) 53 | return boxes.reshape(-1, 4) 54 | 55 | def apply_image_torch(self, image: torch.Tensor) -> torch.Tensor: 56 | """ 57 | Expects batched images with shape BxCxHxW and float format. This 58 | transformation may not exactly match apply_image. apply_image is 59 | the transformation expected by the model. 60 | """ 61 | # Expects an image in BCHW format. May not exactly match apply_image. 62 | target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length) 63 | return F.interpolate( 64 | image, target_size, mode="bilinear", align_corners=False, antialias=True 65 | ) 66 | 67 | def apply_coords_torch( 68 | self, coords: torch.Tensor, original_size: Tuple[int, ...] 69 | ) -> torch.Tensor: 70 | """ 71 | Expects a torch tensor with length 2 in the last dimension. Requires the 72 | original image size in (H, W) format. 73 | """ 74 | old_h, old_w = original_size 75 | new_h, new_w = self.get_preprocess_shape( 76 | original_size[0], original_size[1], self.target_length 77 | ) 78 | coords = deepcopy(coords).to(torch.float) 79 | coords[..., 0] = coords[..., 0] * (new_w / old_w) 80 | coords[..., 1] = coords[..., 1] * (new_h / old_h) 81 | return coords 82 | 83 | def apply_boxes_torch( 84 | self, boxes: torch.Tensor, original_size: Tuple[int, ...] 85 | ) -> torch.Tensor: 86 | """ 87 | Expects a torch tensor with shape Bx4. Requires the original image 88 | size in (H, W) format. 89 | """ 90 | boxes = self.apply_coords_torch(boxes.reshape(-1, 2, 2), original_size) 91 | return boxes.reshape(-1, 4) 92 | 93 | @staticmethod 94 | def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int) -> Tuple[int, int]: 95 | """ 96 | Compute the output size given input size and target long side length. 97 | """ 98 | scale = long_side_length * 1.0 / max(oldh, oldw) 99 | newh, neww = oldh * scale, oldw * scale 100 | neww = int(neww + 0.5) 101 | newh = int(newh + 0.5) 102 | return (newh, neww) 103 | --------------------------------------------------------------------------------