├── .gitattributes
├── .gitignore
├── DATASET.md
├── README.md
├── __init__.py
├── configs
    ├── dinov_sam_ade_eval.yaml
    ├── dinov_sam_coco_swinl_train.yaml
    └── dinov_sam_coco_train.yaml
├── datasets
    ├── __init__.py
    ├── build.py
    ├── custom_dataset_dataloader.py
    ├── dataset_mappers
    │   ├── __init__.py
    │   ├── bdd_semseg_dataset_mapper.py
    │   ├── coco_instance_new_baseline_dataset_mapper.py
    │   ├── coco_interactive_panoptic_new_baseline_dataset_mapper.py
    │   ├── coco_panoptic_new_baseline_dataset_mapper.py
    │   ├── davis_dataset_mapper.py
    │   ├── inference_mapper_with_gt.py
    │   ├── instance_inference_mapper_with_gt.py
    │   ├── lvis_dataset_mapper.py
    │   ├── lvis_dataset_mapper_with_gt.py
    │   ├── mask_former_instance_dataset_mapper.py
    │   ├── mask_former_interactive_panoptic_dataset_mapper.py
    │   ├── mask_former_panoptic_dataset_mapper.py
    │   ├── mask_former_semantic_dataset_mapper.py
    │   ├── o365_instance_new_baseline_dataset_mapper.py
    │   ├── object365_dataset_mapper.py
    │   ├── part_data_filter_whole_new_instance_dataset_mapper.py
    │   ├── pascal_instance_new_baseline_dataset_mapper.py
    │   ├── pascalcontext_dataset_mapper.py
    │   ├── sam_baseline_dataset_mapper.py
    │   ├── sam_baseline_dataset_mapper_content.py
    │   ├── sam_baseline_dataset_mapper_json.py
    │   ├── scannet_dataset_mapper.py
    │   ├── scannet_pano_dataset_mapper.py
    │   ├── seginw_dataset_mapper.py
    │   ├── sunrgbd_dataset_mapper.py
    │   └── ytvos_dataset_mapper.py
    ├── evaluation
    │   ├── __init__.py
    │   ├── instance_evaluation.py
    │   ├── interactive_evaluation.py
    │   ├── panoptic_evaluation.py
    │   ├── pascal_part_evaluation.py
    │   └── segmentation_evaluation.py
    ├── registration
    │   ├── __init__.py
    │   ├── register_ade20k_full.py
    │   ├── register_ade20k_instance.py
    │   ├── register_ade20k_panoptic.py
    │   ├── register_bdd100k_panoseg.py
    │   ├── register_bdd100k_semseg.py
    │   ├── register_coco_panoptic_annos_semseg.py
    │   ├── register_coco_panoptic_annos_semseg_interactive.py
    │   ├── register_coco_panoptic_annos_semseg_interactive_jointboxpoint.py
    │   ├── register_coco_stuff_10k.py
    │   ├── register_context_semseg.py
    │   ├── register_davis_dataset.py
    │   ├── register_lvis_eval.py
    │   ├── register_object365_od.py
    │   ├── register_odinw_od.py
    │   ├── register_paco_part_all.py
    │   ├── register_partimagenet_part_all.py
    │   ├── register_pascal_part_all.py
    │   ├── register_pascal_part_all_interactive.py
    │   ├── register_sam.py
    │   ├── register_sam_json.py
    │   ├── register_scannet_panoptic.py
    │   ├── register_scannet_semseg.py
    │   ├── register_seginw_instance.py
    │   ├── register_sunrgbd_semseg.py
    │   └── register_ytvos_dataset.py
    ├── semseg_loader.py
    ├── shapes
    │   ├── __init__.py
    │   ├── mask_generators.py
    │   ├── sampler.py
    │   ├── scribble.py
    │   └── simpleclick_sampler.py
    └── utils
    │   └── tsv
    │       ├── __init__.py
    │       ├── io_common.py
    │       └── tsv_io.py
├── demo
    ├── __init__.py
    ├── examples
    │   ├── bags.jpg
    │   ├── corgi2.jpg
    │   ├── img.png
    │   └── ref_cat.jpeg
    └── openset_task.py
├── demo_openset.py
├── dinov
    ├── BaseModel.py
    ├── __init__.py
    ├── architectures
    │   ├── __init__.py
    │   ├── build.py
    │   ├── dinov.py
    │   └── registry.py
    ├── backbone
    │   ├── __init__.py
    │   ├── backbone.py
    │   ├── build.py
    │   ├── focal.py
    │   ├── focal_dw.py
    │   ├── registry.py
    │   └── swin.py
    ├── body
    │   ├── __init__.py
    │   ├── build.py
    │   ├── decoder
    │   │   ├── __init__.py
    │   │   ├── build.py
    │   │   ├── dinov_openset_decoder.py
    │   │   ├── dinov_refer_decoder.py
    │   │   ├── registry.py
    │   │   └── utils
    │   │   │   ├── __init__.py
    │   │   │   ├── dino_decoder.py
    │   │   │   └── utils.py
    │   ├── encoder
    │   │   ├── __init__.py
    │   │   ├── build.py
    │   │   ├── encoder_deform.py
    │   │   ├── ops
    │   │   │   ├── functions
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── ms_deform_attn_func.py
    │   │   │   ├── modules
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── ms_deform_attn.py
    │   │   │   ├── setup.py
    │   │   │   ├── src
    │   │   │   │   ├── cpu
    │   │   │   │   │   ├── ms_deform_attn_cpu.cpp
    │   │   │   │   │   └── ms_deform_attn_cpu.h
    │   │   │   │   ├── cuda
    │   │   │   │   │   ├── ms_deform_attn_cuda.cu
    │   │   │   │   │   ├── ms_deform_attn_cuda.h
    │   │   │   │   │   └── ms_deform_im2col_cuda.cuh
    │   │   │   │   ├── ms_deform_attn.h
    │   │   │   │   └── vision.cpp
    │   │   │   └── test.py
    │   │   ├── registry.py
    │   │   └── transformer_encoder_fpn.py
    │   ├── general_head.py
    │   ├── registry.py
    │   └── transformer_blocks.py
    ├── language
    │   ├── __init__.py
    │   └── build.py
    ├── modules
    │   ├── __init__.py
    │   ├── criterion_visual_openset.py
    │   ├── criterion_visual_refer_many2many.py
    │   ├── criterion_visual_refer_one2one.py
    │   ├── matcher.py
    │   ├── matcher_many2many.py
    │   ├── position_encoding.py
    │   └── postprocessing.py
    └── utils
    │   ├── __init__.py
    │   ├── box_ops.py
    │   ├── config.py
    │   └── misc.py
├── repo.diff
├── requirements.txt
├── train_net.py
└── utils
    ├── Config.py
    ├── __init__.py
    ├── arguments.py
    ├── constants.py
    ├── dist.py
    ├── lvis_cat.py
    ├── misc.py
    ├── model.py
    ├── sam_utils
        ├── __init__.py
        ├── amg.py
        ├── onnx.py
        └── transforms.py
    └── visualizer.py


/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # IntelliJ project files
  2 | #repo.diff
  3 | .idea
  4 | .vscode
  5 | .amltignore
  6 | *.iml
  7 | out
  8 | gen
  9 | visinf
 10 | coco_caption
 11 | ### Vim template
 12 | [._]*.s[a-w][a-z]
 13 | [._]s[a-w][a-z]
 14 | *.un~
 15 | Session.vim
 16 | .netrwhist
 17 | *~
 18 | *.sh
 19 | vis_scribble
 20 | 
 21 | ### IPythonNotebook template
 22 | # Temporary data
 23 | .ipynb_checkpoints/
 24 | 
 25 | ### Python template
 26 | # Byte-compiled / optimized / DLL files
 27 | __pycache__/
 28 | *.py[cod]
 29 | *$py.class
 30 | 
 31 | # C extensions
 32 | *.so
 33 | 
 34 | # Distribution / packaging
 35 | .Python
 36 | env/
 37 | build/
 38 | develop-eggs/
 39 | dist/
 40 | downloads/
 41 | eggs/
 42 | .eggs/
 43 | #lib/
 44 | #lib64/
 45 | parts/
 46 | sdist/
 47 | var/
 48 | *.egg-info/
 49 | .installed.cfg
 50 | *.egg
 51 | 
 52 | # PyInstaller
 53 | #  Usually these files are written by a python script from a template
 54 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 55 | *.manifest
 56 | *.spec
 57 | 
 58 | # Installer logs
 59 | pip-log.txt
 60 | pip-delete-this-directory.txt
 61 | 
 62 | # Unit test / coverage reports
 63 | htmlcov/
 64 | .tox/
 65 | .coverage
 66 | .coverage.*
 67 | .cache
 68 | nosetests.xml
 69 | coverage.xml
 70 | *,cover
 71 | 
 72 | # Translations
 73 | *.mo
 74 | *.pot
 75 | 
 76 | # Django stuff:
 77 | *.log
 78 | 
 79 | # Sphinx documentation
 80 | docs/_build/
 81 | 
 82 | # PyBuilder
 83 | target/
 84 | 
 85 | *.ipynb
 86 | *.params
 87 | # *.json
 88 | #.vscode/
 89 | *.code-workspace/
 90 | 
 91 | lib/pycocotools/_mask.c
 92 | lib/nms/cpu_nms.c
 93 | 
 94 | OUTPUT
 95 | OUTPUT/*
 96 | models/*
 97 | DATASET
 98 | DATASET/*
 99 | external/
100 | MODELS
101 | MODELS/*
102 | 
103 | kill.sh
104 | 
105 | draws/
106 | plot/
107 | 
108 | 
109 | 
110 | 
111 | *venv/*
112 | *.pt
113 | *.pth
114 | 


--------------------------------------------------------------------------------
/DATASET.md:
--------------------------------------------------------------------------------
 1 | # Preparing Dataset
 2 | Our dataloader follows [Detectron2](https://github.com/facebookresearch/detectron2) contains (1) A dataset registrator. (2) A dataset mapper. (3) A dataset loader. We modify the dataset registrator and mapper for different datasets.
 3 | 
 4 | ## Training Dataset Note
 5 | <img src="https://user-images.githubusercontent.com/11957155/226159078-7f817452-76f8-44f4-af7a-9f13f3e02554.png" width="500">
 6 | There are overlap between COCO2017, COCO-Karpathy and REF-COCO dataset, and ref-coco is all overalp with the COCO2017 training data, we have exclude the refcocog-umd validation, coco-karpathy test split during training.
 7 | 
 8 | ## ADE20K, Cityscapes, COCO
 9 | Please Refer to [Mask2Former](https://github.com/facebookresearch/Mask2Former/tree/main/datasets).
10 | 
11 | ## BDD100K
12 | Please download the 10k split of BDD100k at https://doc.bdd100k.com/download.html#id1
13 | 
14 | ### Expected dataset structure for cityscapes:
15 | ```
16 | .
17 | └── bdd100k/
18 |     ├── images/
19 |     │   └── 10k/
20 |     │       ├── test
21 |     │       ├── train
22 |     │       └── val
23 |     └── labels/
24 |         ├── ins_seg
25 |         ├── pan_seg
26 |         └── sem_seg
27 | ```
28 | 
29 | ## RefCOCO
30 | Please download the original refcoco datasets at https://github.com/lichengunc/refer.
31 | 
32 | ### Expected dataset structure for refcoco:
33 | ```
34 | .
35 | └── refcocoseg/
36 |     └── refcocog/
37 |         ├── instances.json
38 |         ├── refs(google).p
39 |         └── refs(umd).p
40 | ```
41 | 
42 | Also download the coco dataset at https://cocodataset.org/#home:
43 | ### Expected dataset structure for coco:
44 | ```
45 | .
46 | └── coco/
47 |     ├── annotations
48 |     ├── train2017
49 |     └── val2017
50 | ```
51 | 
52 | After preparing the dataset, run the following command:
53 | 
54 | ```sh
55 | # NOTE: Please modify coco_root and ref_root
56 | python3 refcoco2json.py
57 | ```
58 | 
59 | ## SUN-RGBD
60 | 
61 | 
62 | ## SCAN-Net
63 | 
64 | 
65 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UX-Decoder/DINOv/53bf20d5cfdbb86fa35141a1cff432d4923599f2/__init__.py


--------------------------------------------------------------------------------
/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from . import registration
2 | from .build import *


--------------------------------------------------------------------------------
/datasets/dataset_mappers/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Facebook, Inc. and its affiliates.
 2 | from .coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper
 3 | from .coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper
 4 | from .coco_interactive_panoptic_new_baseline_dataset_mapper import COCOInteractivePanopticNewBaselineDatasetMapper
 5 | from .mask_former_instance_dataset_mapper import MaskFormerInstanceDatasetMapper
 6 | from .mask_former_panoptic_dataset_mapper import MaskFormerPanopticDatasetMapper
 7 | from .mask_former_interactive_panoptic_dataset_mapper import MaskFormerPanopticDatasetMapperInteractive
 8 | from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper
 9 | from .sunrgbd_dataset_mapper import SunRGBDSegDatasetMapper
10 | from .scannet_dataset_mapper import ScanNetSegDatasetMapper
11 | from .bdd_semseg_dataset_mapper import BDDSemDatasetMapper
12 | from .scannet_pano_dataset_mapper import ScanNetPanoDatasetMapper
13 | from .o365_instance_new_baseline_dataset_mapper import O365InstanceNewBaselineDatasetMapper
14 | from .sam_baseline_dataset_mapper import build_transform_gen as sam_transform_gen
15 | from .sam_baseline_dataset_mapper import SamBaselineDatasetMapper
16 | from .sam_baseline_dataset_mapper_json import SamBaselineDatasetMapperJSON
17 | from .sam_baseline_dataset_mapper_content import SamBaselineDatasetMapperContent
18 | from .pascal_instance_new_baseline_dataset_mapper import PascalInstanceNewBaselineDatasetMapper
19 | from .part_data_filter_whole_new_instance_dataset_mapper import PartFilterWholeInstanceNewBaselineDatasetMapper
20 | from .inference_mapper_with_gt import CoCoInferenceDatasetMapper
21 | from .instance_inference_mapper_with_gt import InstanceInferenceDatasetMapperGT
22 | 
23 | from .davis_dataset_mapper import DAVISDatasetMapper
24 | from .ytvos_dataset_mapper import YTVOSDatasetMapper
25 | from .seginw_dataset_mapper import SeginWDatasetMapper
26 | from .lvis_dataset_mapper_with_gt import LVISInferenceMapperWithGT
27 | from .pascalcontext_dataset_mapper import PascalContextSegDatasetMapper


--------------------------------------------------------------------------------
/datasets/dataset_mappers/bdd_semseg_dataset_mapper.py:
--------------------------------------------------------------------------------
  1 | # --------------------------------------------------------
  2 | # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
  3 | # Copyright (c) 2022 Microsoft
  4 | # Licensed under The MIT License [see LICENSE for details]
  5 | # Modified by Xueyan Zou (xueyan@cs.wisc.edu)
  6 | # --------------------------------------------------------
  7 | # Copyright (c) Facebook, Inc. and its affiliates.
  8 | import copy
  9 | 
 10 | import scipy.io
 11 | import numpy as np
 12 | import torch
 13 | from PIL import Image
 14 | 
 15 | from torchvision import transforms
 16 | from dinov.utils import configurable
 17 | 
 18 | __all__ = ["BDDSemDatasetMapper"]
 19 | 
 20 | 
 21 | # This is specifically designed for the COCO dataset.
 22 | class BDDSemDatasetMapper:
 23 |     """
 24 |     A callable which takes a dataset dict in Detectron2 Dataset format,
 25 |     and map it into a format used by MaskFormer.
 26 | 
 27 |     This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
 28 | 
 29 |     The callable currently does the following:
 30 | 
 31 |     1. Read the image from "file_name"
 32 |     2. Applies geometric transforms to the image and annotation
 33 |     3. Find and applies suitable cropping to the image and annotation
 34 |     4. Prepare image and annotation to Tensors
 35 |     """
 36 | 
 37 |     @configurable
 38 |     def __init__(
 39 |         self,
 40 |         is_train=True,
 41 |         min_size_test=None,
 42 |         max_size_test=None,
 43 |         mean=None,
 44 |         std=None,
 45 |     ):
 46 |         """
 47 |         NOTE: this interface is experimental.
 48 |         Args:
 49 |             is_train: for training or inference
 50 |             augmentations: a list of augmentations or deterministic transforms to apply
 51 |             tfm_gens: data augmentation
 52 |             image_format: an image format supported by :func:`detection_utils.read_image`.
 53 |         """
 54 |         self.is_train = is_train
 55 |         self.min_size_test = min_size_test
 56 |         self.max_size_test = max_size_test
 57 |         self.pixel_mean = torch.tensor(mean)[:,None,None]
 58 |         self.pixel_std = torch.tensor(std)[:,None,None]
 59 | 
 60 |         t = []
 61 |         t.append(transforms.Resize(self.min_size_test, interpolation=Image.BICUBIC))
 62 |         self.transform = transforms.Compose(t)
 63 |     
 64 |     @classmethod
 65 |     def from_config(cls, cfg, is_train=True):
 66 |         ret = {
 67 |             "is_train": is_train,
 68 |             "min_size_test": cfg['INPUT']['MIN_SIZE_TEST'],
 69 |             "max_size_test": cfg['INPUT']['MAX_SIZE_TEST'],
 70 |             "mean": cfg['INPUT']['PIXEL_MEAN'],
 71 |             "std": cfg['INPUT']['PIXEL_STD'],
 72 |         }
 73 |         return ret
 74 |     
 75 |     def read_semseg(self, file_name):
 76 |         if '.png' in file_name:
 77 |             semseg = np.asarray(Image.open(file_name))
 78 |         elif '.mat' in file_name:
 79 |             semseg = scipy.io.loadmat(file_name)['LabelMap']
 80 |         return semseg
 81 | 
 82 |     def __call__(self, dataset_dict):
 83 |         """
 84 |         Args:
 85 |             dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
 86 | 
 87 |         Returns:
 88 |             dict: a format that builtin models in detectron2 accept
 89 |         """
 90 |         dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
 91 |         file_name = dataset_dict['file_name']
 92 |         semseg_name = dataset_dict['sem_seg_file_name']
 93 |         image = Image.open(file_name).convert('RGB')
 94 | 
 95 |         dataset_dict['width'] = image.size[0]
 96 |         dataset_dict['height'] = image.size[1]
 97 | 
 98 |         if self.is_train == False:
 99 |             image = self.transform(image)
100 |             image = torch.from_numpy(np.asarray(image).copy())
101 |             image = image.permute(2,0,1)
102 |             
103 |         semseg = self.read_semseg(semseg_name)
104 |         semseg = torch.from_numpy(semseg.astype(np.int32))
105 |         dataset_dict['image'] = image
106 |         dataset_dict['semseg'] = semseg
107 |         return dataset_dict


--------------------------------------------------------------------------------
/datasets/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py:
--------------------------------------------------------------------------------
  1 | # ------------------------------------------------------------------------
  2 | # Copyright (c) 2022 IDEA. All Rights Reserved.
  3 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
  4 | # ------------------------------------------------------------------------
  5 | # Modified from Mask2Former https://github.com/facebookresearch/Mask2Former by Feng Li.
  6 | import copy
  7 | import logging
  8 | 
  9 | import numpy as np
 10 | import torch
 11 | 
 12 | from detectron2.config import configurable
 13 | from detectron2.data import detection_utils as utils
 14 | from detectron2.data import transforms as T
 15 | from detectron2.data.transforms import TransformGen
 16 | from detectron2.structures import BitMasks, Boxes, Instances
 17 | 
 18 | __all__ = ["COCOPanopticNewBaselineDatasetMapper"]
 19 | 
 20 | 
 21 | def build_transform_gen(cfg, is_train):
 22 |     """
 23 |     Create a list of default :class:`Augmentation` from config.
 24 |     Now it includes resizing and flipping.
 25 |     Returns:
 26 |         list[Augmentation]
 27 |     """
 28 |     assert is_train, "Only support training augmentation"
 29 |     image_size = cfg.INPUT.IMAGE_SIZE
 30 |     min_scale = cfg.INPUT.MIN_SCALE
 31 |     max_scale = cfg.INPUT.MAX_SCALE
 32 | 
 33 |     augmentation = []
 34 | 
 35 |     if cfg.INPUT.RANDOM_FLIP != "none":
 36 |         augmentation.append(
 37 |             T.RandomFlip(
 38 |                 horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal",
 39 |                 vertical=cfg.INPUT.RANDOM_FLIP == "vertical",
 40 |             )
 41 |         )
 42 | 
 43 |     augmentation.extend([
 44 |         T.ResizeScale(
 45 |             min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size
 46 |         ),
 47 |         T.FixedSizeCrop(crop_size=(image_size, image_size)),
 48 |     ])
 49 | 
 50 |     return augmentation
 51 | 
 52 | 
 53 | # This is specifically designed for the COCO dataset.
 54 | class COCOPanopticNewBaselineDatasetMapper:
 55 |     """
 56 |     A callable which takes a dataset dict in Detectron2 Dataset format,
 57 |     and map it into a format used by MaskFormer.
 58 | 
 59 |     This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
 60 | 
 61 |     The callable currently does the following:
 62 | 
 63 |     1. Read the image from "file_name"
 64 |     2. Applies geometric transforms to the image and annotation
 65 |     3. Find and applies suitable cropping to the image and annotation
 66 |     4. Prepare image and annotation to Tensors
 67 |     """
 68 | 
 69 |     @configurable
 70 |     def __init__(
 71 |         self,
 72 |         is_train=True,
 73 |         *,
 74 |         tfm_gens,
 75 |         image_format,
 76 |     ):
 77 |         """
 78 |         NOTE: this interface is experimental.
 79 |         Args:
 80 |             is_train: for training or inference
 81 |             augmentations: a list of augmentations or deterministic transforms to apply
 82 |             crop_gen: crop augmentation
 83 |             tfm_gens: data augmentation
 84 |             image_format: an image format supported by :func:`detection_utils.read_image`.
 85 |         """
 86 |         self.tfm_gens = tfm_gens
 87 |         logging.getLogger(__name__).info(
 88 |             "[COCOPanopticNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(
 89 |                 str(self.tfm_gens)
 90 |             )
 91 |         )
 92 | 
 93 |         self.img_format = image_format
 94 |         self.is_train = is_train
 95 | 
 96 |     @classmethod
 97 |     def from_config(cls, cfg, is_train=True):
 98 |         # Build augmentation
 99 |         tfm_gens = build_transform_gen(cfg, is_train)
100 | 
101 |         ret = {
102 |             "is_train": is_train,
103 |             "tfm_gens": tfm_gens,
104 |             "image_format": cfg.INPUT.FORMAT,
105 |         }
106 |         return ret
107 | 
108 |     def __call__(self, dataset_dict):
109 |         """
110 |         Args:
111 |             dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
112 | 
113 |         Returns:
114 |             dict: a format that builtin models in detectron2 accept
115 |         """
116 |         dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
117 |         image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
118 |         utils.check_image_size(dataset_dict, image)
119 | 
120 |         image, transforms = T.apply_transform_gens(self.tfm_gens, image)
121 |         image_shape = image.shape[:2]  # h, w
122 | 
123 |         # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
124 |         # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
125 |         # Therefore it's important to use torch.Tensor.
126 |         dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
127 | 
128 |         if not self.is_train:
129 |             # USER: Modify this if you want to keep them for some reason.
130 |             dataset_dict.pop("annotations", None)
131 |             return dataset_dict
132 | 
133 |         if "pan_seg_file_name" in dataset_dict:
134 |             pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB")
135 |             segments_info = dataset_dict["segments_info"]
136 | 
137 |             # apply the same transformation to panoptic segmentation
138 |             pan_seg_gt = transforms.apply_segmentation(pan_seg_gt)
139 | 
140 |             from panopticapi.utils import rgb2id
141 | 
142 |             pan_seg_gt = rgb2id(pan_seg_gt)
143 | 
144 |             instances = Instances(image_shape)
145 |             classes = []
146 |             masks = []
147 |             for segment_info in segments_info:
148 |                 class_id = segment_info["category_id"]
149 |                 if not segment_info["iscrowd"]:
150 |                     classes.append(class_id)
151 |                     masks.append(pan_seg_gt == segment_info["id"])
152 | 
153 |             classes = np.array(classes)
154 |             instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
155 |             if len(masks) == 0:
156 |                 # Some image does not have annotation (all ignored)
157 |                 instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
158 |                 instances.gt_boxes = Boxes(torch.zeros((0, 4)))
159 |             else:
160 |                 masks = BitMasks(
161 |                     torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
162 |                 )
163 |                 instances.gt_masks = masks.tensor
164 |                 instances.gt_boxes = masks.get_bounding_boxes()
165 | 
166 |             dataset_dict["instances"] = instances
167 | 
168 |         return dataset_dict
169 | 


--------------------------------------------------------------------------------
/datasets/dataset_mappers/lvis_dataset_mapper.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
  3 | import copy
  4 | import random
  5 | 
  6 | import scipy.io
  7 | import numpy as np
  8 | import torch
  9 | from PIL import Image
 10 | 
 11 | from torchvision import transforms
 12 | 
 13 | from pycocotools import mask
 14 | from detectron2.data import detection_utils as utils
 15 | from detectron2.data import transforms as T
 16 | from detectron2.data import MetadataCatalog
 17 | 
 18 | # from ...Networks.Mask2Former.utils import configurable
 19 | 
 20 | __all__ = ["LVISDatasetMapper"]
 21 | 
 22 | def build_transform_gen(cfg, is_train):
 23 |     """
 24 |     Create a list of default :class:`Augmentation` from config.
 25 |     Now it includes resizing and flipping.
 26 |     Returns:
 27 |         list[Augmentation]
 28 |     """
 29 |     assert is_train, "Only support training augmentation"
 30 |     cfg_input = cfg['INPUT']
 31 |     image_size = cfg_input['IMAGE_SIZE']
 32 |     min_scale = cfg_input['MIN_SCALE']
 33 |     max_scale = cfg_input['MAX_SCALE']
 34 | 
 35 |     augmentation = []
 36 | 
 37 | 
 38 |     if cfg_input['RANDOM_FLIP'] != "none":
 39 |         augmentation.append(
 40 |             T.RandomFlip(
 41 |                 horizontal=cfg_input['RANDOM_FLIP'] == "horizontal",
 42 |                 vertical=cfg_input['RANDOM_FLIP'] == "vertical",
 43 |             )
 44 |         )
 45 | 
 46 |     augmentation.extend([
 47 |         T.ResizeScale(
 48 |             min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size
 49 |         ),
 50 |         T.FixedSizeCrop(crop_size=(image_size, image_size)),
 51 |     ])
 52 |     
 53 |     return augmentation
 54 | 
 55 | 
 56 | # This is specifically designed for the COCO dataset.
 57 | class LVISDatasetMapper:
 58 |     """
 59 |     A callable which takes a dataset dict in Detectron2 Dataset format,
 60 |     and map it into a format used by MaskFormer.
 61 | 
 62 |     This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
 63 | 
 64 |     The callable currently does the following:
 65 | 
 66 |     1. Read the image from "file_name"
 67 |     2. Applies geometric transforms to the image and annotation
 68 |     3. Find and applies suitable cropping to the image and annotation
 69 |     4. Prepare image and annotation to Tensors
 70 |     """
 71 | 
 72 |     @configurable
 73 |     def __init__(
 74 |         self,
 75 |         is_train=True,
 76 |         tfm_gens=None,
 77 |         image_format=None,
 78 |         min_size_test=None,
 79 |         max_size_test=None,
 80 |         mean=None,
 81 |         std=None,
 82 |         max_len=None,
 83 |     ):
 84 |         """
 85 |         NOTE: this interface is experimental.
 86 |         Args:
 87 |             is_train: for training or inference
 88 |             augmentations: a list of augmentations or deterministic transforms to apply
 89 |             tfm_gens: data augmentation
 90 |             image_format: an image format supported by :func:`detection_utils.read_image`.
 91 |         """
 92 |         self.tfm_gens = tfm_gens
 93 |         self.img_format = image_format
 94 |         self.is_train = is_train
 95 |         self.min_size_test = min_size_test
 96 |         self.max_size_test = max_size_test
 97 |         self.pixel_mean = torch.tensor(mean)[:,None,None]
 98 |         self.pixel_std = torch.tensor(std)[:,None,None]
 99 |         self.max_grounding_num = max_len
100 | 
101 |         t = []
102 |         t.append(transforms.Resize(self.min_size_test, interpolation=Image.BICUBIC))
103 |         self.transform = transforms.Compose(t)
104 |         self.categories = torch.load(MetadataCatalog.get('logistic').get('cat_root'))
105 | 
106 |     @classmethod
107 |     def from_config(cls, cfg, is_train=True):
108 |         # Build augmentation
109 |         if is_train:
110 |             tfm_gens = build_transform_gen(cfg, is_train)
111 |         else:
112 |             tfm_gens = None
113 | 
114 |         ret = {
115 |             "is_train": is_train,
116 |             "tfm_gens": tfm_gens,
117 |             "image_format": cfg['INPUT']['FORMAT'],
118 |             "min_size_test": cfg['INPUT']['MIN_SIZE_TEST'],
119 |             "max_size_test": cfg['INPUT']['MAX_SIZE_TEST'],
120 |             "mean": cfg['INPUT']['PIXEL_MEAN'],
121 |             "std": cfg['INPUT']['PIXEL_STD'],
122 |             "max_len": cfg['MODEL']['DECODER']['GROUNDING']['MAX_LEN'],
123 |         }
124 |         return ret
125 | 
126 |     def __call__(self, dataset_dict):
127 |         """
128 |         Args:
129 |             dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
130 | 
131 |         Returns:
132 |             dict: a format that builtin models in detectron2 accept
133 |         """
134 |         dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
135 |         file_name = dataset_dict['file_name']
136 |         if self.is_train == False:
137 |             assert False, "Only support training."
138 |         else:
139 |             image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
140 |             utils.check_image_size(dataset_dict, image)
141 |             image, transforms = T.apply_transform_gens(self.tfm_gens, image)
142 |             image_shape = image.shape[:2]  # h, w
143 |             dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
144 | 
145 |             assert len(dataset_dict['instance']) > 0
146 |             masks_grd = []
147 |             texts_grd = []
148 |             boxes_grd = []
149 |             hash_grd = []
150 |             for inst, label in zip(dataset_dict['instance'], dataset_dict['labels']):
151 |                 rle = mask.frPyObjects(inst, dataset_dict['height'], dataset_dict['width'])
152 |                 m = mask.decode(rle)
153 |                 # sometimes there are multiple binary map (corresponding to multiple segs)
154 |                 m = np.sum(m, axis=2)
155 |                 m = m.astype(np.uint8)  # convert to np.uint8
156 |                 m = transforms.apply_segmentation(m[:,:,None])[:,:,0]
157 |                 masks_grd += [m]
158 |                 label_names = self.categories[label]
159 |                 rand_id = random.randint(0, len(label_names)-1)
160 |                 texts_grd.append(label_names[rand_id].lower())
161 |                 hash_grd.append(hash(label_names[rand_id].lower()))
162 | 
163 |             indices = torch.randperm(len(hash_grd))[:self.max_grounding_num]
164 |             masks_grd = torch.from_numpy(np.stack(masks_grd))[indices]
165 |             boxes_grd = torch.tensor(boxes_grd)
166 |             texts_grd = np.array(texts_grd)[indices.numpy()].tolist()
167 |             hash_grd = np.array(hash_grd)[indices.numpy()].tolist()
168 |             groundings = {'masks': masks_grd, 'texts': texts_grd, 'hash': hash_grd, 'mode': 'text'}
169 |             dataset_dict["groundings"] = groundings
170 |         return dataset_dict


--------------------------------------------------------------------------------
/datasets/dataset_mappers/mask_former_panoptic_dataset_mapper.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | import copy
  3 | import logging
  4 | 
  5 | import numpy as np
  6 | import torch
  7 | from torch.nn import functional as F
  8 | 
  9 | from detectron2.data import detection_utils as utils
 10 | from detectron2.data import transforms as T
 11 | from detectron2.structures import BitMasks, Instances
 12 | 
 13 | from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper
 14 | from dinov.utils import configurable
 15 | 
 16 | 
 17 | 
 18 | __all__ = ["MaskFormerPanopticDatasetMapper"]
 19 | 
 20 | 
 21 | class MaskFormerPanopticDatasetMapper(MaskFormerSemanticDatasetMapper):
 22 |     """
 23 |     A callable which takes a dataset dict in Detectron2 Dataset format,
 24 |     and map it into a format used by MaskFormer for panoptic segmentation.
 25 | 
 26 |     The callable currently does the following:
 27 | 
 28 |     1. Read the image from "file_name"
 29 |     2. Applies geometric transforms to the image and annotation
 30 |     3. Find and applies suitable cropping to the image and annotation
 31 |     4. Prepare image and annotation to Tensors
 32 |     """
 33 | 
 34 |     @configurable
 35 |     def __init__(
 36 |         self,
 37 |         is_train=True,
 38 |         *,
 39 |         augmentations,
 40 |         image_format,
 41 |         ignore_label,
 42 |         size_divisibility,
 43 |     ):
 44 |         """
 45 |         NOTE: this interface is experimental.
 46 |         Args:
 47 |             is_train: for training or inference
 48 |             augmentations: a list of augmentations or deterministic transforms to apply
 49 |             image_format: an image format supported by :func:`detection_utils.read_image`.
 50 |             ignore_label: the label that is ignored to evaluation
 51 |             size_divisibility: pad image size to be divisible by this value
 52 |         """
 53 |         super().__init__(
 54 |             is_train,
 55 |             augmentations=augmentations,
 56 |             image_format=image_format,
 57 |             ignore_label=ignore_label,
 58 |             size_divisibility=size_divisibility,
 59 |         )
 60 | 
 61 |     def __call__(self, dataset_dict):
 62 |         """
 63 |         Args:
 64 |             dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
 65 | 
 66 |         Returns:
 67 |             dict: a format that builtin models in detectron2 accept
 68 |         """
 69 |         assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!"
 70 | 
 71 |         dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
 72 |         image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
 73 |         utils.check_image_size(dataset_dict, image)
 74 | 
 75 |         # semantic segmentation
 76 |         if "sem_seg_file_name" in dataset_dict:
 77 |             # PyTorch transformation not implemented for uint16, so converting it to double first
 78 |             sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double")
 79 |         else:
 80 |             sem_seg_gt = None
 81 | 
 82 |         # panoptic segmentation
 83 |         if "pan_seg_file_name" in dataset_dict:
 84 |             pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB")
 85 |             segments_info = dataset_dict["segments_info"]
 86 |         else:
 87 |             pan_seg_gt = None
 88 |             segments_info = None
 89 | 
 90 |         if pan_seg_gt is None:
 91 |             raise ValueError(
 92 |                 "Cannot find 'pan_seg_file_name' for panoptic segmentation dataset {}.".format(
 93 |                     dataset_dict["file_name"]
 94 |                 )
 95 |             )
 96 | 
 97 |         aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
 98 |         aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
 99 |         image = aug_input.image
100 |         if sem_seg_gt is not None:
101 |             sem_seg_gt = aug_input.sem_seg
102 | 
103 |         # apply the same transformation to panoptic segmentation
104 |         pan_seg_gt = transforms.apply_segmentation(pan_seg_gt)
105 | 
106 |         from panopticapi.utils import rgb2id
107 | 
108 |         pan_seg_gt = rgb2id(pan_seg_gt)
109 | 
110 |         # Pad image and segmentation label here!
111 |         image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
112 |         if sem_seg_gt is not None:
113 |             sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long"))
114 |         pan_seg_gt = torch.as_tensor(pan_seg_gt.astype("long"))
115 | 
116 |         if self.size_divisibility > 0:
117 |             image_size = (image.shape[-2], image.shape[-1])
118 |             padding_size = [
119 |                 0,
120 |                 self.size_divisibility - image_size[1],
121 |                 0,
122 |                 self.size_divisibility - image_size[0],
123 |             ]
124 |             image = F.pad(image, padding_size, value=128).contiguous()
125 |             if sem_seg_gt is not None:
126 |                 sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous()
127 |             pan_seg_gt = F.pad(
128 |                 pan_seg_gt, padding_size, value=0
129 |             ).contiguous()  # 0 is the VOID panoptic label
130 | 
131 |         image_shape = (image.shape[-2], image.shape[-1])  # h, w
132 | 
133 |         # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
134 |         # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
135 |         # Therefore it's important to use torch.Tensor.
136 |         dataset_dict["image"] = image
137 |         if sem_seg_gt is not None:
138 |             dataset_dict["sem_seg"] = sem_seg_gt.long()
139 | 
140 |         if "annotations" in dataset_dict:
141 |             raise ValueError("Pemantic segmentation dataset should not have 'annotations'.")
142 | 
143 |         # Prepare per-category binary masks
144 |         pan_seg_gt = pan_seg_gt.numpy()
145 |         instances = Instances(image_shape)
146 |         classes = []
147 |         masks = []
148 |         for segment_info in segments_info:
149 |             class_id = segment_info["category_id"]
150 |             if not segment_info["iscrowd"]:
151 |                 classes.append(class_id)
152 |                 masks.append(pan_seg_gt == segment_info["id"])
153 | 
154 |         classes = np.array(classes)
155 |         instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
156 |         if len(masks) == 0:
157 |             # Some image does not have annotation (all ignored)
158 |             instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
159 |         else:
160 |             masks = BitMasks(
161 |                 torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
162 |             )
163 |             instances.gt_masks = masks.tensor
164 |             instances.gt_boxes = masks.get_bounding_boxes()
165 | 
166 |         dataset_dict["instances"] = instances
167 | 
168 |         return dataset_dict
169 | 


--------------------------------------------------------------------------------
/datasets/dataset_mappers/scannet_dataset_mapper.py:
--------------------------------------------------------------------------------
  1 | # --------------------------------------------------------
  2 | # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
  3 | # Copyright (c) 2022 Microsoft
  4 | # Licensed under The MIT License [see LICENSE for details]
  5 | # Modified by Xueyan Zou (xueyan@cs.wisc.edu)
  6 | # --------------------------------------------------------
  7 | # Copyright (c) Facebook, Inc. and its affiliates.
  8 | import copy
  9 | 
 10 | import scipy.io
 11 | import numpy as np
 12 | import torch
 13 | from PIL import Image
 14 | 
 15 | from torchvision import transforms
 16 | from dinov.utils import configurable
 17 | 
 18 | __all__ = ["ScanNetSegDatasetMapper"]
 19 | 
 20 | 
 21 | # This is specifically designed for the COCO dataset.
 22 | class ScanNetSegDatasetMapper:
 23 |     """
 24 |     A callable which takes a dataset dict in Detectron2 Dataset format,
 25 |     and map it into a format used by MaskFormer.
 26 | 
 27 |     This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
 28 | 
 29 |     The callable currently does the following:
 30 | 
 31 |     1. Read the image from "file_name"
 32 |     2. Applies geometric transforms to the image and annotation
 33 |     3. Find and applies suitable cropping to the image and annotation
 34 |     4. Prepare image and annotation to Tensors
 35 |     """
 36 | 
 37 |     @configurable
 38 |     def __init__(
 39 |         self,
 40 |         is_train=True,
 41 |         min_size_test=None,
 42 |         max_size_test=None,
 43 |         mean=None,
 44 |         std=None,
 45 |     ):
 46 |         """
 47 |         NOTE: this interface is experimental.
 48 |         Args:
 49 |             is_train: for training or inference
 50 |             augmentations: a list of augmentations or deterministic transforms to apply
 51 |             tfm_gens: data augmentation
 52 |             image_format: an image format supported by :func:`detection_utils.read_image`.
 53 |         """
 54 |         self.is_train = is_train
 55 |         self.min_size_test = min_size_test
 56 |         self.max_size_test = max_size_test
 57 |         self.pixel_mean = torch.tensor(mean)[:,None,None]
 58 |         self.pixel_std = torch.tensor(std)[:,None,None]
 59 | 
 60 |         t = []
 61 |         t.append(transforms.Resize(self.min_size_test, interpolation=Image.BICUBIC))
 62 |         self.transform = transforms.Compose(t)
 63 |     
 64 |     @classmethod
 65 |     def from_config(cls, cfg, is_train=True):
 66 |         ret = {
 67 |             "is_train": is_train,
 68 |             "min_size_test": cfg['INPUT']['MIN_SIZE_TEST'],
 69 |             "max_size_test": cfg['INPUT']['MAX_SIZE_TEST'],
 70 |             "mean": cfg['INPUT']['PIXEL_MEAN'],
 71 |             "std": cfg['INPUT']['PIXEL_STD'],
 72 |         }
 73 |         return ret
 74 |     
 75 |     def read_semseg(self, file_name):
 76 |         if '.png' in file_name:
 77 |             semseg = np.asarray(Image.open(file_name))
 78 |         elif '.mat' in file_name:
 79 |             semseg = scipy.io.loadmat(file_name)['LabelMap']
 80 |         return semseg
 81 | 
 82 |     def __call__(self, dataset_dict):
 83 |         """
 84 |         Args:
 85 |             dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
 86 | 
 87 |         Returns:
 88 |             dict: a format that builtin models in detectron2 accept
 89 |         """
 90 |         dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
 91 |         file_name = dataset_dict['file_name']
 92 |         semseg_name = dataset_dict['sem_seg_file_name']
 93 |         image = Image.open(file_name).convert('RGB')
 94 |         
 95 |         dataset_dict['width'] = image.size[0]
 96 |         dataset_dict['height'] = image.size[1]
 97 | 
 98 |         if self.is_train == False:
 99 |             image = self.transform(image)
100 |             image = torch.from_numpy(np.asarray(image).copy())
101 |             image = image.permute(2,0,1)
102 |             
103 |         semseg = self.read_semseg(semseg_name)
104 |         semseg = torch.from_numpy(semseg.astype(np.int32))
105 |         dataset_dict['image'] = image
106 |         dataset_dict['semseg'] = semseg
107 |         return dataset_dict


--------------------------------------------------------------------------------
/datasets/dataset_mappers/scannet_pano_dataset_mapper.py:
--------------------------------------------------------------------------------
 1 | # --------------------------------------------------------
 2 | # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
 3 | # Copyright (c) 2022 Microsoft
 4 | # Licensed under The MIT License [see LICENSE for details]
 5 | # Modified by Xueyan Zou (xueyan@cs.wisc.edu)
 6 | # --------------------------------------------------------
 7 | # Copyright (c) Facebook, Inc. and its affiliates.
 8 | import copy
 9 | 
10 | import scipy.io
11 | import numpy as np
12 | import torch
13 | from PIL import Image
14 | 
15 | from torchvision import transforms
16 | from dinov.utils import configurable
17 | 
18 | __all__ = ["ScanNetPanoDatasetMapper"]
19 | 
20 | 
21 | # This is specifically designed for the COCO dataset.
22 | class ScanNetPanoDatasetMapper:
23 |     """
24 |     A callable which takes a dataset dict in Detectron2 Dataset format,
25 |     and map it into a format used by MaskFormer.
26 | 
27 |     This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
28 | 
29 |     The callable currently does the following:
30 | 
31 |     1. Read the image from "file_name"
32 |     2. Applies geometric transforms to the image and annotation
33 |     3. Find and applies suitable cropping to the image and annotation
34 |     4. Prepare image and annotation to Tensors
35 |     """
36 | 
37 |     @configurable
38 |     def __init__(
39 |         self,
40 |         is_train=True,
41 |         min_size_test=None,
42 |         max_size_test=None,
43 |         mean=None,
44 |         std=None,
45 |     ):
46 |         """
47 |         NOTE: this interface is experimental.
48 |         Args:
49 |             is_train: for training or inference
50 |             augmentations: a list of augmentations or deterministic transforms to apply
51 |             tfm_gens: data augmentation
52 |             image_format: an image format supported by :func:`detection_utils.read_image`.
53 |         """
54 |         self.is_train = is_train
55 |         self.min_size_test = min_size_test
56 |         self.max_size_test = max_size_test
57 |         self.pixel_mean = torch.tensor(mean)[:,None,None]
58 |         self.pixel_std = torch.tensor(std)[:,None,None]
59 | 
60 |         t = []
61 |         t.append(transforms.Resize(self.min_size_test, interpolation=Image.BICUBIC))
62 |         self.transform = transforms.Compose(t)
63 |     
64 |     @classmethod
65 |     def from_config(cls, cfg, is_train=True):
66 |         ret = {
67 |             "is_train": is_train,
68 |             "min_size_test": cfg['INPUT']['MIN_SIZE_TEST'],
69 |             "max_size_test": cfg['INPUT']['MAX_SIZE_TEST'],
70 |             "mean": cfg['INPUT']['PIXEL_MEAN'],
71 |             "std": cfg['INPUT']['PIXEL_STD'],
72 |         }
73 |         return ret
74 |     
75 |     def __call__(self, dataset_dict):
76 |         """
77 |         Args:
78 |             dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
79 | 
80 |         Returns:
81 |             dict: a format that builtin models in detectron2 accept
82 |         """
83 |         dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
84 |         file_name = dataset_dict['file_name']
85 |         image = Image.open(file_name).convert('RGB')
86 | 
87 |         dataset_dict['file_name'] = '_'.join(file_name.split('/')[-3:]) # HACK for /tmp file storage on predictions.
88 |         dataset_dict['width'] = image.size[0]
89 |         dataset_dict['height'] = image.size[1]
90 | 
91 |         image = self.transform(image)
92 |         image = torch.from_numpy(np.asarray(image).copy())
93 |         image = image.permute(2,0,1)
94 |         dataset_dict['image'] = image
95 |         return dataset_dict


--------------------------------------------------------------------------------
/datasets/dataset_mappers/sunrgbd_dataset_mapper.py:
--------------------------------------------------------------------------------
  1 | # --------------------------------------------------------
  2 | # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
  3 | # Copyright (c) 2022 Microsoft
  4 | # Licensed under The MIT License [see LICENSE for details]
  5 | # Modified by Xueyan Zou (xueyan@cs.wisc.edu)
  6 | # --------------------------------------------------------
  7 | # Copyright (c) Facebook, Inc. and its affiliates.
  8 | import copy
  9 | 
 10 | import scipy.io
 11 | import numpy as np
 12 | import torch
 13 | from PIL import Image
 14 | 
 15 | from torchvision import transforms
 16 | from dinov.utils import configurable
 17 | 
 18 | __all__ = ["SunRGBDSegDatasetMapper"]
 19 | 
 20 | 
 21 | # This is specifically designed for the COCO dataset.
 22 | class SunRGBDSegDatasetMapper:
 23 |     """
 24 |     A callable which takes a dataset dict in Detectron2 Dataset format,
 25 |     and map it into a format used by MaskFormer.
 26 | 
 27 |     This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
 28 | 
 29 |     The callable currently does the following:
 30 | 
 31 |     1. Read the image from "file_name"
 32 |     2. Applies geometric transforms to the image and annotation
 33 |     3. Find and applies suitable cropping to the image and annotation
 34 |     4. Prepare image and annotation to Tensors
 35 |     """
 36 | 
 37 |     @configurable
 38 |     def __init__(
 39 |         self,
 40 |         is_train=True,
 41 |         min_size_test=None,
 42 |         max_size_test=None,
 43 |         mean=None,
 44 |         std=None,
 45 |     ):
 46 |         """
 47 |         NOTE: this interface is experimental.
 48 |         Args:
 49 |             is_train: for training or inference
 50 |             augmentations: a list of augmentations or deterministic transforms to apply
 51 |             tfm_gens: data augmentation
 52 |             image_format: an image format supported by :func:`detection_utils.read_image`.
 53 |         """
 54 |         self.is_train = is_train
 55 |         self.min_size_test = min_size_test
 56 |         self.max_size_test = max_size_test
 57 |         self.pixel_mean = torch.tensor(mean)[:,None,None]
 58 |         self.pixel_std = torch.tensor(std)[:,None,None]
 59 | 
 60 |         t = []
 61 |         t.append(transforms.Resize(self.min_size_test, interpolation=Image.BICUBIC))
 62 |         self.transform = transforms.Compose(t)
 63 |     
 64 |     @classmethod
 65 |     def from_config(cls, cfg, is_train=True):
 66 |         ret = {
 67 |             "is_train": is_train,
 68 |             "min_size_test": cfg['INPUT']['MIN_SIZE_TEST'],
 69 |             "max_size_test": cfg['INPUT']['MAX_SIZE_TEST'],
 70 |             "mean": cfg['INPUT']['PIXEL_MEAN'],
 71 |             "std": cfg['INPUT']['PIXEL_STD'],
 72 |         }
 73 |         return ret
 74 |     
 75 |     def read_semseg(self, file_name):
 76 |         if '.png' in file_name:
 77 |             semseg = np.asarray(Image.open(file_name))
 78 |         elif '.mat' in file_name:
 79 |             semseg = scipy.io.loadmat(file_name)['LabelMap']
 80 |         return semseg
 81 | 
 82 |     def __call__(self, dataset_dict):
 83 |         """
 84 |         Args:
 85 |             dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
 86 | 
 87 |         Returns:
 88 |             dict: a format that builtin models in detectron2 accept
 89 |         """
 90 |         dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
 91 |         file_name = dataset_dict['file_name']
 92 |         semseg_name = dataset_dict['sem_seg_file_name']
 93 |         image = Image.open(file_name).convert('RGB')
 94 | 
 95 |         dataset_dict['width'] = image.size[0]
 96 |         dataset_dict['height'] = image.size[1]
 97 | 
 98 |         if self.is_train == False:
 99 |             image = self.transform(image)
100 |             image = torch.from_numpy(np.asarray(image).copy())
101 |             image = image.permute(2,0,1)
102 |             
103 |         semseg = self.read_semseg(semseg_name)
104 |         semseg = torch.from_numpy(semseg.astype(np.int32))
105 |         dataset_dict['image'] = image
106 |         dataset_dict['semseg'] = semseg
107 |         return dataset_dict


--------------------------------------------------------------------------------
/datasets/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | from .instance_evaluation import *
2 | from .segmentation_evaluation import *
3 | from .panoptic_evaluation import *
4 | from .interactive_evaluation import *


--------------------------------------------------------------------------------
/datasets/evaluation/instance_evaluation.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | import contextlib
  3 | import copy
  4 | import io
  5 | import itertools
  6 | import json
  7 | import logging
  8 | import numpy as np
  9 | import os
 10 | import pickle
 11 | from collections import OrderedDict
 12 | import pycocotools.mask as mask_util
 13 | import torch
 14 | from pycocotools.coco import COCO
 15 | from pycocotools.cocoeval import COCOeval
 16 | from tabulate import tabulate
 17 | 
 18 | import detectron2.utils.comm as comm
 19 | from detectron2.config import CfgNode
 20 | from detectron2.data import MetadataCatalog
 21 | from detectron2.data.datasets.coco import convert_to_coco_json
 22 | from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco
 23 | from detectron2.evaluation.fast_eval_api import COCOeval_opt
 24 | from detectron2.structures import Boxes, BoxMode, pairwise_iou
 25 | from detectron2.utils.file_io import PathManager
 26 | from detectron2.utils.logger import create_small_table
 27 | 
 28 | 
 29 | # modified from COCOEvaluator for instance segmetnat
 30 | class InstanceSegEvaluator(COCOEvaluator):
 31 |     """
 32 |     Evaluate AR for object proposals, AP for instance detection/segmentation, AP
 33 |     for keypoint detection outputs using COCO's metrics.
 34 |     See http://cocodataset.org/#detection-eval and
 35 |     http://cocodataset.org/#keypoints-eval to understand its metrics.
 36 |     The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means
 37 |     the metric cannot be computed (e.g. due to no predictions made).
 38 | 
 39 |     In addition to COCO, this evaluator is able to support any bounding box detection,
 40 |     instance segmentation, or keypoint detection dataset.
 41 |     """
 42 | 
 43 |     def _eval_predictions(self, predictions, img_ids=None):
 44 |         """
 45 |         Evaluate predictions. Fill self._results with the metrics of the tasks.
 46 |         """
 47 |         self._logger.info("Preparing results for COCO format ...")
 48 |         coco_results = list(itertools.chain(*[x["instances"] for x in predictions]))
 49 |         tasks = self._tasks or self._tasks_from_predictions(coco_results)
 50 | 
 51 |         # unmap the category ids for COCO
 52 |         if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
 53 |             dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id
 54 |             # all_contiguous_ids = list(dataset_id_to_contiguous_id.values())
 55 |             # num_classes = len(all_contiguous_ids)
 56 |             # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1
 57 | 
 58 |             reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()}
 59 |             for result in coco_results:
 60 |                 category_id = result["category_id"]
 61 |                 # assert category_id < num_classes, (
 62 |                 #     f"A prediction has class={category_id}, "
 63 |                 #     f"but the dataset only has {num_classes} classes and "
 64 |                 #     f"predicted class id should be in [0, {num_classes - 1}]."
 65 |                 # )
 66 |                 assert category_id in reverse_id_mapping, (
 67 |                     f"A prediction has class={category_id}, "
 68 |                     f"but the dataset only has class ids in {dataset_id_to_contiguous_id}."
 69 |                 )
 70 |                 result["category_id"] = reverse_id_mapping[category_id]
 71 | 
 72 |         if self._output_dir:
 73 |             file_path = os.path.join(self._output_dir, "coco_instances_results.json")
 74 |             self._logger.info("Saving results to {}".format(file_path))
 75 |             with PathManager.open(file_path, "w") as f:
 76 |                 f.write(json.dumps(coco_results))
 77 |                 f.flush()
 78 | 
 79 |         if not self._do_evaluation:
 80 |             self._logger.info("Annotations are not available for evaluation.")
 81 |             return
 82 | 
 83 |         self._logger.info(
 84 |             "Evaluating predictions with {} COCO API...".format(
 85 |                 "unofficial" if self._use_fast_impl else "official"
 86 |             )
 87 |         )
 88 |         for task in sorted(tasks):
 89 |             assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!"
 90 |             coco_eval = (
 91 |                 _evaluate_predictions_on_coco(
 92 |                     self._coco_api,
 93 |                     coco_results,
 94 |                     task,
 95 |                     kpt_oks_sigmas=self._kpt_oks_sigmas,
 96 |                     use_fast_impl=self._use_fast_impl,
 97 |                     img_ids=img_ids,
 98 |                     max_dets_per_image=self._max_dets_per_image,
 99 |                 )
100 |                 if len(coco_results) > 0
101 |                 else None  # cocoapi does not handle empty results very well
102 |             )
103 | 
104 |             res = self._derive_coco_results(
105 |                 coco_eval, task, class_names=self._metadata.get("thing_classes")
106 |             )
107 |             self._results[task] = res
108 | 


--------------------------------------------------------------------------------
/datasets/evaluation/pascal_part_evaluation.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | import contextlib
  3 | import copy
  4 | import io
  5 | import itertools
  6 | import json
  7 | import logging
  8 | import numpy as np
  9 | import os
 10 | import pickle
 11 | from collections import OrderedDict
 12 | import pycocotools.mask as mask_util
 13 | import torch
 14 | from pycocotools.coco import COCO
 15 | from pycocotools.cocoeval import COCOeval
 16 | from tabulate import tabulate
 17 | 
 18 | import detectron2.utils.comm as comm
 19 | from detectron2.config import CfgNode
 20 | from detectron2.data import MetadataCatalog
 21 | from detectron2.data.datasets.coco import convert_to_coco_json
 22 | from detectron2.evaluation.coco_evaluation import COCOEvaluator
 23 | from detectron2.structures import Boxes, BoxMode, pairwise_iou
 24 | from detectron2.utils.file_io import PathManager
 25 | from detectron2.utils.logger import create_small_table
 26 | from ..registration.register_pascal_part_all import (
 27 |     PASCAL_PART_BASE_CATEGORIES as categories_seen,
 28 |     PASCAL_PART_NOVEL_CATEGORIES as categories_unseen,
 29 | )
 30 | 
 31 | 
 32 | class PASCALPARTEvaluator(COCOEvaluator):
 33 |     """
 34 |     PASCALPARTEvaluator on open_vocabulary
 35 |     """
 36 | 
 37 |     def _derive_coco_results(self, coco_eval, iou_type, class_names=None):
 38 |         """
 39 |         Additionally plot mAP for 'seen classes' and 'unseen classes'
 40 |         """
 41 | 
 42 |         metrics = {
 43 |             "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
 44 |             "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
 45 |             "keypoints": ["AP", "AP50", "AP75", "APm", "APl"],
 46 |         }[iou_type]
 47 | 
 48 |         if coco_eval is None:
 49 |             self._logger.warn("No predictions from the model!")
 50 |             return {metric: float("nan") for metric in metrics}
 51 | 
 52 |         # the standard metrics
 53 |         results = {
 54 |             metric: float(coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan")
 55 |             for idx, metric in enumerate(metrics)
 56 |         }
 57 |         self._logger.info(
 58 |             "Evaluation results for {}: \n".format(iou_type) + create_small_table(results)
 59 |         )
 60 |         if not np.isfinite(sum(results.values())):
 61 |             self._logger.info("Some metrics cannot be computed and is shown as NaN.")
 62 | 
 63 |         if class_names is None or len(class_names) <= 1:
 64 |             return results
 65 |         # Compute per-category AP
 66 |         # from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252 # noqa
 67 |         precisions = coco_eval.eval["precision"]
 68 |         # precision has dims (iou, recall, cls, area range, max dets)
 69 |         assert len(class_names) == precisions.shape[2]
 70 | 
 71 |         seen_names = set([x['name'] for x in categories_seen])
 72 |         unseen_names = set([x['name'] for x in categories_unseen])
 73 |         results_per_category = []
 74 |         results_per_category50 = []
 75 |         results_per_category_seen = []
 76 |         results_per_category_unseen = []
 77 |         results_per_category50_seen = []
 78 |         results_per_category50_unseen = []
 79 |         for idx, name in enumerate(class_names):
 80 |             # area range index 0: all area ranges
 81 |             # max dets index -1: typically 100 per image
 82 |             precision = precisions[:, :, idx, 0, -1]
 83 |             precision = precision[precision > -1]
 84 |             ap = np.mean(precision) if precision.size else float("nan")
 85 |             results_per_category.append(("{}".format(name), float(ap * 100)))
 86 |             precision50 = precisions[0, :, idx, 0, -1]
 87 |             precision50 = precision50[precision50 > -1]
 88 |             ap50 = np.mean(precision50) if precision50.size else float("nan")
 89 |             results_per_category50.append(("{}".format(name), float(ap50 * 100)))
 90 |             if name in seen_names:
 91 |                 results_per_category_seen.append(float(ap * 100))
 92 |                 results_per_category50_seen.append(float(ap50 * 100))
 93 |             if name in unseen_names:
 94 |                 results_per_category_unseen.append(float(ap * 100))
 95 |                 results_per_category50_unseen.append(float(ap50 * 100))
 96 | 
 97 |         # tabulate it
 98 |         N_COLS = min(6, len(results_per_category) * 2)
 99 |         results_flatten = list(itertools.chain(*results_per_category))
100 |         results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)])
101 |         table = tabulate(
102 |             results_2d,
103 |             tablefmt="pipe",
104 |             floatfmt=".3f",
105 |             headers=["category", "AP"] * (N_COLS // 2),
106 |             numalign="left",
107 |         )
108 |         self._logger.info("Per-category {} AP: \n".format(iou_type) + table)
109 | 
110 |         N_COLS = min(6, len(results_per_category50) * 2)
111 |         results_flatten = list(itertools.chain(*results_per_category50))
112 |         results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)])
113 |         table = tabulate(
114 |             results_2d,
115 |             tablefmt="pipe",
116 |             floatfmt=".3f",
117 |             headers=["category", "AP50"] * (N_COLS // 2),
118 |             numalign="left",
119 |         )
120 |         self._logger.info("Per-category {} AP50: \n".format(iou_type) + table)
121 | 
122 |         self._logger.info(
123 |             "Seen {} AP: {}".format(
124 |                 iou_type,
125 |                 sum(results_per_category_seen) / len(results_per_category_seen),
126 |             ))
127 |         self._logger.info(
128 |             "Unseen {} AP: {}".format(
129 |                 iou_type,
130 |                 sum(results_per_category_unseen) / len(results_per_category_unseen),
131 |             ))
132 | 
133 |         self._logger.info(
134 |             "Seen {} AP50: {}".format(
135 |                 iou_type,
136 |                 sum(results_per_category50_seen) / len(results_per_category50_seen),
137 |             ))
138 |         self._logger.info(
139 |             "Unseen {} AP50: {}".format(
140 |                 iou_type,
141 |                 sum(results_per_category50_unseen) / len(results_per_category50_unseen),
142 |             ))
143 | 
144 |         results.update({"AP-" + name: ap for name, ap in results_per_category})
145 |         results["AP-seen"] = sum(results_per_category_seen) / len(results_per_category_seen)
146 |         results["AP-unseen"] = sum(results_per_category_unseen) / len(results_per_category_unseen)
147 |         results["AP50-seen"] = sum(results_per_category50_seen) / len(results_per_category50_seen)
148 |         results["AP50-unseen"] = sum(results_per_category50_unseen) / len(results_per_category50_unseen)
149 |         return results


--------------------------------------------------------------------------------
/datasets/registration/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Facebook, Inc. and its affiliates.
 2 | from . import (
 3 |     register_ade20k_full,
 4 |     register_ade20k_panoptic,
 5 |     register_coco_stuff_10k,
 6 |     register_coco_panoptic_annos_semseg,
 7 |     register_coco_panoptic_annos_semseg_interactive,
 8 |     register_coco_panoptic_annos_semseg_interactive_jointboxpoint,
 9 |     register_ade20k_instance,
10 |     register_sam,
11 |     register_sunrgbd_semseg,
12 |     register_scannet_semseg,
13 |     register_bdd100k_semseg,
14 |     register_scannet_panoptic,
15 |     register_bdd100k_panoseg,
16 |     register_object365_od,
17 |     register_pascal_part_all,
18 |     register_pascal_part_all_interactive,
19 |     register_paco_part_all,
20 |     register_partimagenet_part_all,
21 | )
22 | 
23 | from . import (
24 |     register_ytvos_dataset,
25 |     register_davis_dataset,
26 |     register_seginw_instance,
27 |     register_lvis_eval,
28 |     register_context_semseg,
29 |     register_odinw_od,
30 | )


--------------------------------------------------------------------------------
/datasets/registration/register_ade20k_instance.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Facebook, Inc. and its affiliates.
 2 | import json
 3 | import logging
 4 | import numpy as np
 5 | import os
 6 | from PIL import Image
 7 | 
 8 | from detectron2.data import DatasetCatalog, MetadataCatalog
 9 | from detectron2.data.datasets.coco import load_coco_json, register_coco_instances
10 | from detectron2.utils.file_io import PathManager
11 | 
12 | ADE_CATEGORIES = [{'id': 7, 'name': 'bed'}, {'id': 8, 'name': 'windowpane'}, {'id': 10, 'name': 'cabinet'}, {'id': 12, 'name': 'person'}, {'id': 14, 'name': 'door'}, {'id': 15, 'name': 'table'}, {'id': 18, 'name': 'curtain'}, {'id': 19, 'name': 'chair'}, {'id': 20, 'name': 'car'}, {'id': 22, 'name': 'painting'}, {'id': 23, 'name': 'sofa'}, {'id': 24, 'name': 'shelf'}, {'id': 27, 'name': 'mirror'}, {'id': 30, 'name': 'armchair'}, {'id': 31, 'name': 'seat'}, {'id': 32, 'name': 'fence'}, {'id': 33, 'name': 'desk'}, {'id': 35, 'name': 'wardrobe'}, {'id': 36, 'name': 'lamp'}, {'id': 37, 'name': 'bathtub'}, {'id': 38, 'name': 'railing'}, {'id': 39, 'name': 'cushion'}, {'id': 41, 'name': 'box'}, {'id': 42, 'name': 'column'}, {'id': 43, 'name': 'signboard'}, {'id': 44, 'name': 'chest of drawers'}, {'id': 45, 'name': 'counter'}, {'id': 47, 'name': 'sink'}, {'id': 49, 'name': 'fireplace'}, {'id': 50, 'name': 'refrigerator'}, {'id': 53, 'name': 'stairs'}, {'id': 55, 'name': 'case'}, {'id': 56, 'name': 'pool table'}, {'id': 57, 'name': 'pillow'}, {'id': 58, 'name': 'screen door'}, {'id': 62, 'name': 'bookcase'}, {'id': 64, 'name': 'coffee table'}, {'id': 65, 'name': 'toilet'}, {'id': 66, 'name': 'flower'}, {'id': 67, 'name': 'book'}, {'id': 69, 'name': 'bench'}, {'id': 70, 'name': 'countertop'}, {'id': 71, 'name': 'stove'}, {'id': 72, 'name': 'palm'}, {'id': 73, 'name': 'kitchen island'}, {'id': 74, 'name': 'computer'}, {'id': 75, 'name': 'swivel chair'}, {'id': 76, 'name': 'boat'}, {'id': 78, 'name': 'arcade machine'}, {'id': 80, 'name': 'bus'}, {'id': 81, 'name': 'towel'}, {'id': 82, 'name': 'light'}, {'id': 83, 'name': 'truck'}, {'id': 85, 'name': 'chandelier'}, {'id': 86, 'name': 'awning'}, {'id': 87, 'name': 'streetlight'}, {'id': 88, 'name': 'booth'}, {'id': 89, 'name': 'television receiver'}, {'id': 90, 'name': 'airplane'}, {'id': 92, 'name': 'apparel'}, {'id': 93, 'name': 'pole'}, {'id': 95, 'name': 'bannister'}, {'id': 97, 'name': 'ottoman'}, {'id': 98, 'name': 'bottle'}, {'id': 102, 'name': 'van'}, {'id': 103, 'name': 'ship'}, {'id': 104, 'name': 'fountain'}, {'id': 107, 'name': 'washer'}, {'id': 108, 'name': 'plaything'}, {'id': 110, 'name': 'stool'}, {'id': 111, 'name': 'barrel'}, {'id': 112, 'name': 'basket'}, {'id': 115, 'name': 'bag'}, {'id': 116, 'name': 'minibike'}, {'id': 118, 'name': 'oven'}, {'id': 119, 'name': 'ball'}, {'id': 120, 'name': 'food'}, {'id': 121, 'name': 'step'}, {'id': 123, 'name': 'trade name'}, {'id': 124, 'name': 'microwave'}, {'id': 125, 'name': 'pot'}, {'id': 126, 'name': 'animal'}, {'id': 127, 'name': 'bicycle'}, {'id': 129, 'name': 'dishwasher'}, {'id': 130, 'name': 'screen'}, {'id': 132, 'name': 'sculpture'}, {'id': 133, 'name': 'hood'}, {'id': 134, 'name': 'sconce'}, {'id': 135, 'name': 'vase'}, {'id': 136, 'name': 'traffic light'}, {'id': 137, 'name': 'tray'}, {'id': 138, 'name': 'ashcan'}, {'id': 139, 'name': 'fan'}, {'id': 142, 'name': 'plate'}, {'id': 143, 'name': 'monitor'}, {'id': 144, 'name': 'bulletin board'}, {'id': 146, 'name': 'radiator'}, {'id': 147, 'name': 'glass'}, {'id': 148, 'name': 'clock'}, {'id': 149, 'name': 'flag'}]
13 | 
14 | 
15 | _PREDEFINED_SPLITS = {
16 |     # point annotations without masks
17 |     "ade20k_instance_train": (
18 |         "ADEChallengeData2016/images/training",
19 |         "ADEChallengeData2016/ade20k_instance_train.json",
20 |     ),
21 |     "ade20k_instance_val": (
22 |         "ADEChallengeData2016/images/validation",
23 |         "ADEChallengeData2016/ade20k_instance_val.json",
24 |     ),
25 | }
26 | 
27 | 
28 | def _get_ade_instances_meta():
29 |     thing_ids = [k["id"] for k in ADE_CATEGORIES]
30 |     assert len(thing_ids) == 100, len(thing_ids)
31 |     # Mapping from the incontiguous ADE category id to an id in [0, 99]
32 |     thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)}
33 |     thing_classes = [k["name"] for k in ADE_CATEGORIES]
34 |     ret = {
35 |         "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id,
36 |         "thing_classes": thing_classes,
37 |     }
38 |     return ret
39 | 
40 | 
41 | def register_all_ade20k_instance(root):
42 |     for key, (image_root, json_file) in _PREDEFINED_SPLITS.items():
43 |         # Assume pre-defined datasets live in `./datasets`.
44 |         register_coco_instances(
45 |             key,
46 |             _get_ade_instances_meta(),
47 |             os.path.join(root, json_file) if "://" not in json_file else json_file,
48 |             os.path.join(root, image_root),
49 |         )
50 | 
51 | 
52 | _root = os.getenv("DETECTRON2_DATASETS", "datasets")
53 | if _root!='datasets':
54 |     register_all_ade20k_instance(_root)
55 | 


--------------------------------------------------------------------------------
/datasets/registration/register_bdd100k_semseg.py:
--------------------------------------------------------------------------------
 1 | # --------------------------------------------------------
 2 | # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
 3 | # Copyright (c) 2022 Microsoft
 4 | # Licensed under The MIT License [see LICENSE for details]
 5 | # Modified by Xueyan Zou (xueyan@cs.wisc.edu)
 6 | # --------------------------------------------------------
 7 | # Copyright (c) Facebook, Inc. and its affiliates.
 8 | import numpy as np
 9 | import os
10 | import glob
11 | from typing import List, Tuple, Union
12 | 
13 | from detectron2.data import DatasetCatalog, MetadataCatalog
14 | from detectron2.utils.file_io import PathManager
15 | 
16 | from utils.constants import BDD_SEM
17 | 
18 | __all__ = ["load_scannet_instances", "register_scannet_context"]
19 | 
20 | 
21 | def load_bdd_instances(name: str, dirname: str, split: str, class_names: Union[List[str], Tuple[str, ...]]):
22 |     """
23 |     Load BDD annotations to Detectron2 format.
24 | 
25 |     Args:
26 |         dirname: Contain "Annotations", "ImageSets", "JPEGImages"
27 |         split (str): one of "train", "test", "val", "trainval"
28 |         class_names: list or tuple of class names
29 |     """
30 |     img_folder = os.path.join(dirname, 'images', '10k', split)
31 |     img_pths = sorted(glob.glob(os.path.join(img_folder, '*.jpg')))
32 |     
33 |     sem_folder = os.path.join(dirname, 'labels', 'sem_seg', 'masks', split)
34 |     sem_pths = sorted(glob.glob(os.path.join(sem_folder, '*.png')))
35 | 
36 |     assert len(img_pths) == len(sem_pths)
37 |         
38 |     dicts = []
39 |     for img_pth, sem_pth in zip(img_pths, sem_pths):
40 |         r = {
41 |             "file_name": img_pth,
42 |             "sem_seg_file_name": sem_pth,
43 |             "image_id": img_pth.split('/')[-1].split('.')[0],
44 |         }
45 |         dicts.append(r)
46 |     return dicts
47 | 
48 | 
49 | def register_bdd_context(name, dirname, split, class_names=BDD_SEM):
50 |     DatasetCatalog.register(name, lambda: load_bdd_instances(name, dirname, split, class_names))
51 |     MetadataCatalog.get(name).set(
52 |         stuff_classes=class_names,
53 |         dirname=dirname,
54 |         split=split,
55 |         ignore_label=[255],
56 |         thing_dataset_id_to_contiguous_id={},
57 |         class_offset=0,
58 |         keep_sem_bgd=False
59 |     )
60 | 
61 | 
62 | def register_all_sunrgbd_seg(root):
63 |     SPLITS = [
64 |             ("bdd10k_val_sem_seg", "bdd100k", "val"),
65 |         ]
66 |         
67 |     for name, dirname, split in SPLITS:
68 |         register_bdd_context(name, os.path.join(root, dirname), split)
69 |         MetadataCatalog.get(name).evaluator_type = "sem_seg"
70 | 
71 | 
72 | _root = os.getenv("DATSETW", "datasets")
73 | if _root!='datasets':
74 |     register_all_sunrgbd_seg(_root)


--------------------------------------------------------------------------------
/datasets/registration/register_context_semseg.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # Copyright (c) Facebook, Inc. and its affiliates.
 3 | import numpy as np
 4 | import os
 5 | import xml.etree.ElementTree as ET
 6 | from typing import List, Tuple, Union
 7 | 
 8 | from detectron2.data import DatasetCatalog, MetadataCatalog
 9 | from detectron2.structures import BoxMode
10 | from detectron2.utils.file_io import PathManager
11 | 
12 | from utils.constants import PASCAL_CONTEXT_459, PASCAL_CONTEXT_59, PASCAL_CONTEXT_33
13 | 
14 | __all__ = ["load_context_instances", "register_pascal_context"]
15 | dataset2class = {"context_459_val_seg": PASCAL_CONTEXT_459,
16 |                  "context_59_val_seg": PASCAL_CONTEXT_59}
17 | dataset2labelfolder = {"context_459_val_seg": "trainval",
18 |                        "context_59_val_seg": "59_context_labels"}
19 | dataset2postfix = {"context_459_val_seg": ".mat",
20 |                    "context_59_val_seg": ".png"}
21 | dataset2segloader = {"context_459_val_seg": "MAT",
22 |                      "context_59_val_seg": "PIL"}
23 | 
24 | 
25 | def load_context_instances(name: str, dirname: str, split: str, class_names: Union[List[str], Tuple[str, ...]]):
26 |     """
27 |     Load Pascal VOC detection annotations to Detectron2 format.
28 | 
29 |     Args:
30 |         dirname: Contain "Annotations", "ImageSets", "JPEGImages"
31 |         split (str): one of "train", "test", "val", "trainval"
32 |         class_names: list or tuple of class names
33 |     """
34 |     with PathManager.open(os.path.join(dirname, "VOC2010", "ImageSets", "Main", split + ".txt")) as f:
35 |         fileids = np.loadtxt(f, dtype=np.str)
36 | 
37 |     # Needs to read many small annotation files. Makes sense at local
38 |     image_dirname = PathManager.get_local_path(os.path.join(dirname, "VOC2010"))
39 |     semseg_dirname = PathManager.get_local_path(os.path.join(dirname, dataset2labelfolder[name]))
40 | 
41 |     dicts = []
42 |     for fileid in fileids:
43 |         jpeg_file = os.path.join(image_dirname, "JPEGImages", fileid + ".jpg")
44 |         seg_file = os.path.join(semseg_dirname, fileid + dataset2postfix[name])
45 | 
46 |         r = {
47 |             "file_name": jpeg_file,
48 |             "sem_seg_file_name": seg_file,
49 |             "image_id": fileid,
50 |         }
51 |         dicts.append(r)
52 |     return dicts
53 | 
54 | 
55 | def register_pascal_context(name, dirname, split, year, class_names=dataset2class):
56 |     DatasetCatalog.register(name, lambda: load_context_instances(name, dirname, split, class_names))
57 |     MetadataCatalog.get(name).set(
58 |         stuff_classes=class_names[name],
59 |         dirname=dirname,
60 |         year=year,
61 |         split=split,
62 |         ignore_label=[0],
63 |         thing_dataset_id_to_contiguous_id={},
64 |         class_offset=1,
65 |         semseg_loader=dataset2segloader[name],
66 |         keep_sem_bgd=False
67 |     )
68 | 
69 | 
70 | def register_all_context_seg(root):
71 |     SPLITS = [
72 |         ("context_459_val_seg", "pascal_context", "val"),
73 |         ("context_59_val_seg", "pascal_context", "val"),
74 |     ]
75 |     year = 2010
76 |     for name, dirname, split in SPLITS:
77 |         register_pascal_context(name, os.path.join(root, dirname), split, year)
78 |         MetadataCatalog.get(name).evaluator_type = "sem_seg"
79 | 
80 | 
81 | _root = os.getenv("DATSETW", "datasets")
82 | if _root!='datasets':
83 |     register_all_context_seg(_root)


--------------------------------------------------------------------------------
/datasets/registration/register_davis_dataset.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # Copyright (c) Facebook, Inc. and its affiliates.
 3 | import os
 4 | import glob
 5 | import json
 6 | from typing import List, Tuple, Union
 7 | 
 8 | import cv2
 9 | import numpy as np
10 | from scipy.io import loadmat
11 | 
12 | from detectron2.data import DatasetCatalog, MetadataCatalog
13 | from detectron2.structures import BoxMode
14 | from detectron2.utils.file_io import PathManager
15 | 
16 | 
17 | __all__ = ["load_davis_instances", "register_davis_context"]
18 | 
19 | def load_davis_instances(name: str, dirname: str, split: str, year: str):
20 |     """
21 |     Load Pascal VOC detection annotations to Detectron2 format.
22 | 
23 |     Args:
24 |         dirname: Contain "Annotations", "ImageSets", "JPEGImages"
25 |         split (str): one of "train", "test", "val", "trainval"
26 |         class_names: list or tuple of class names
27 |     """
28 |     meta_txt = os.path.join(dirname, 'ImageSets', year, "{}.txt".format(split))
29 |     meta_json = os.path.join(dirname, 'video_objects_info.json')
30 |     meta_json = json.load(open(meta_json))['videos']
31 |     video_names = [line.strip() for line in open(meta_txt).readlines()]
32 | 
33 |     video_dir = os.path.join(dirname, 'JPEGImages', '480p')
34 |     mask_dir = os.path.join(dirname, 'Annotations', '480p')
35 |     scibble_dir = os.path.join(dirname, 'Scribbles', '480p')
36 |     semantic_dir = os.path.join(dirname, 'Annotations_semantics', '480p')
37 | 
38 |     dicts = []
39 |     for vid_name in video_names:
40 |         objects = meta_json[vid_name]['objects']
41 |         r = {
42 |             "file_name": os.path.join(video_dir, vid_name),
43 |             "mask_name": os.path.join(mask_dir, vid_name),
44 |             "scibble_name": os.path.join(scibble_dir, vid_name),
45 |             "semantic_name": os.path.join(semantic_dir, vid_name),
46 |             "objects": objects,
47 |         }
48 |         dicts.append(r)
49 |     return dicts
50 | 
51 | def register_davis_context(name, dirname, split, year):
52 |     if not os.path.exists(dirname):
53 |         print("not register for ", name)
54 |         return -1
55 |     load_davis_instances(name, dirname, split, year)
56 |     DatasetCatalog.register("{}".format(name), lambda: load_davis_instances(name, dirname, split, year))
57 |     MetadataCatalog.get("{}".format(name)).set(
58 |         dirname=dirname,
59 |         thing_dataset_id_to_contiguous_id={},
60 |     )
61 | 
62 | def register_all_davis(root):
63 |     SPLITS = [
64 |             ("davis17_val", "DAVIS17", "val", "2017"),
65 |             ("davis16_val", "DAVIS17", "val", "2016"),
66 |         ]
67 | 
68 |     for name, dirname, split, year in SPLITS:
69 |         register_davis_context(name, os.path.join(root, dirname), split, year)
70 |         MetadataCatalog.get("{}".format(name)).evaluator_type = None
71 | 
72 | _root = os.getenv("TRACKING_DATASET", "datasets")
73 | if _root!='datasets':
74 |     register_all_davis(_root)
75 | 


--------------------------------------------------------------------------------
/datasets/registration/register_lvis_eval.py:
--------------------------------------------------------------------------------
  1 | from detectron2.data.datasets import get_lvis_instances_meta
  2 | from detectron2.data import DatasetCatalog, MetadataCatalog
  3 | from utils.lvis_cat import LVIS_CATEGORIES as LVIS_V1_CATEGORIES
  4 | # from utils.constants import LVIS_CATEGORIES as LVIS_V1_CATEGORIES
  5 | import logging
  6 | import os
  7 | from detectron2.utils.file_io import PathManager
  8 | from fvcore.common.timer import Timer
  9 | import json
 10 | 
 11 | 
 12 | 
 13 | _PREDEFINED_SPLITS_LVIS = {
 14 |     "lvis_v1": {
 15 |         "lvis_v1_minival": ("coco/", "coco/annotations/lvis_v1_minival_inserted_image_name.json"),
 16 |         "lvis_train": ("coco/", "lvis/lvis_v1_train.json"),
 17 |     },
 18 | }
 19 | 
 20 | def get_lvis_instances_meta_v1():
 21 |     assert len(LVIS_V1_CATEGORIES) == 1203
 22 |     cat_ids = [k["id"] for k in LVIS_V1_CATEGORIES]
 23 |     assert min(cat_ids) == 1 and max(cat_ids) == len(
 24 |         cat_ids
 25 |     ), "Category ids are not in [1, #categories], as expected"
 26 |     # Ensure that the category list is sorted by id
 27 |     thing_ids = [k["id"] for k in LVIS_V1_CATEGORIES]
 28 |     # lvis_categories = sorted(LVIS_V1_CATEGORIES, key=lambda x: x["id"])
 29 |     thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)}
 30 |     # thing_classes = [k["name"] for k in O365_CATEGORIES]
 31 |     def preprocess_name(name):
 32 |         name = name.lower().strip()
 33 |         name = name.replace('_', ' ')
 34 |         return name
 35 |     thing_classes = [preprocess_name(k["synonyms"][0]) for k in LVIS_V1_CATEGORIES]
 36 |     meta = {
 37 |         "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id,
 38 |         "thing_classes": thing_classes,
 39 |             }
 40 |     return meta
 41 | 
 42 | 
 43 | def register_lvis_instances(name, metadata, json_file, image_root):
 44 |     """
 45 |     Register a dataset in LVIS's json annotation format for instance detection and segmentation.
 46 | 
 47 |     Args:
 48 |         name (str): a name that identifies the dataset, e.g. "lvis_v0.5_train".
 49 |         metadata (dict): extra metadata associated with this dataset. It can be an empty dict.
 50 |         json_file (str): path to the json instance annotation file.
 51 |         image_root (str or path-like): directory which contains all the images.
 52 |     """
 53 |     DatasetCatalog.register(name, lambda: load_lvis_json(image_root, json_file, name))
 54 |     MetadataCatalog.get(name).set(
 55 |         json_file=json_file, image_root=image_root, evaluator_type="lvis", **metadata
 56 |     )
 57 | 
 58 | 
 59 | def load_lvis_json(image_root, annot_json, metadata):
 60 |     """
 61 |     Args:
 62 |         image_dir (str): path to the raw dataset. e.g., "~/coco/train2017".
 63 |         gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017".
 64 |         json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json".
 65 |     Returns:
 66 |         list[dict]: a list of dicts in Detectron2 standard format. (See
 67 |         `Using Custom Datasets </tutorials/datasets.html>`_ )
 68 |     """
 69 |     with PathManager.open(annot_json) as f:
 70 |         json_info = json.load(f)
 71 | 
 72 |     imageid2seg = {}
 73 |     imageid2box = {}
 74 |     imageid2lable = {}
 75 |     for anno in json_info["annotations"]:
 76 |         image_id = anno['image_id']
 77 |         seg = anno["segmentation"]
 78 |         bbox = anno["bbox"]
 79 |         label = anno["category_id"]
 80 |         if image_id not in imageid2seg:
 81 |             imageid2seg[image_id] = []
 82 |         if image_id not in imageid2box:
 83 |             imageid2box[image_id] = []
 84 |         if image_id not in imageid2lable:
 85 |             imageid2lable[image_id] = []
 86 |         imageid2seg[image_id] += [seg]
 87 |         imageid2box[image_id] += [bbox]
 88 |         imageid2lable[image_id] += [label]
 89 | 
 90 |     ret = []
 91 |     cnt_empty = 0
 92 |     for image in json_info["images"]:
 93 |         image_file = os.path.join(image_root ,'/'.join(image["coco_url"].split('/')[-2:]))
 94 |         image_id = image['id']
 95 |         if image_id not in imageid2lable:
 96 |             cnt_empty += 1
 97 |             continue
 98 |         ret.append(
 99 |             {
100 |                 "file_name": image_file,
101 |                 "image_id": image_id,
102 |                 "height": image['height'],
103 |                 "width": image['width'],
104 |                 "instance": imageid2seg[image_id],
105 |                 "box": imageid2box[image_id],
106 |                 "labels": imageid2lable[image_id],
107 |             }
108 |         )
109 | 
110 |     print("Empty annotations: {}".format(cnt_empty))
111 |     assert len(ret), f"No images found in {image_root}!"
112 |     assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"]
113 |     return ret
114 | 
115 | 
116 | def register_all_lvis(_root_eval, _root_train):
117 |     for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_LVIS.items():
118 |         for key, (image_root, json_file) in splits_per_dataset.items():
119 |             if 'val' in key:
120 |                 root = _root_eval
121 |             else:
122 |                 root = _root_train
123 |             register_lvis_instances(
124 |                 key,
125 |                 get_lvis_instances_meta_v1(),
126 |                 os.path.join(root, json_file) if "://" not in json_file else json_file,
127 |                 os.path.join(root, image_root),
128 |             )
129 | 
130 | 
131 | _root_eval = os.getenv("DATASET3", "datasets")
132 | _root_train = os.getenv("DATASET", "datasets")
133 | if _root_train!='datasets' and _root_eval!='datasets':
134 |     register_all_lvis(_root_eval, _root_train)


--------------------------------------------------------------------------------
/datasets/registration/register_paco_part_all.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Facebook, Inc. and its affiliates.
 2 | import logging
 3 | import os
 4 | from detectron2.data import DatasetCatalog, MetadataCatalog
 5 | import copy
 6 | # from detectron2.data.datasets.register_coco import register_coco_instances
 7 | from detectron2.data.datasets.coco import load_coco_json
 8 | import json
 9 | 
10 | 
11 | def _get_paco_metadata(key):
12 |     # if '_base' in key:
13 |     #     id_to_name = {x['id']: x['name'] for x in PASCAL_PART_BASE_CATEGORIES}
14 |     # else:
15 |     id_to_name = {x['id']: x['name'] for x in PACO_CATEGORIES}
16 | 
17 |     thing_classes_ = [id_to_name[k] for k in sorted(id_to_name)]
18 |     PACO_CATEGORIES_=copy.deepcopy(PACO_CATEGORIES)
19 |     for cat in PACO_CATEGORIES_:
20 |         if ':' not in cat['name']:
21 |             cat['name']=cat['name']+':whole'
22 |         if '_(' in cat['name']:
23 |             cat['name']=cat['name'][:cat['name'].find('_(')]+cat['name'][cat['name'].find(')')+1:]
24 |         if '_' in cat['name']:
25 |             cat['name']=cat['name'].replace('_',' ')
26 |     id_to_name = {x['id']: x['name'] for x in PACO_CATEGORIES_}
27 |     thing_dataset_id_to_contiguous_id = {
28 |         x: i for i, x in enumerate(sorted(id_to_name))}
29 |     thing_classes = [id_to_name[k] for k in sorted(id_to_name)]
30 | 
31 |     part_classes = [a.split(":")[1].lower() for a in thing_classes]
32 |     thing_clases_id_to_part_id={v: sorted(set(part_classes)).index(n) for v, n in enumerate(part_classes)}
33 |     whole_classes = [a.split(":")[0].lower() for a in thing_classes]
34 | 
35 |     no_part_index = sorted(set(part_classes)).index('whole')
36 |     thing_classes_id_without_part = [k for k, v in thing_clases_id_to_part_id.items() if no_part_index==v]
37 | 
38 |     thing_clases_id_to_whole_id={v: sorted(set(whole_classes)).index(n) for v, n in enumerate(whole_classes)}
39 |     thing_clases_id_to_flattened_wholepart = {tid: thing_clases_id_to_whole_id[tid]*len(set(part_classes))+pid for tid, pid in thing_clases_id_to_part_id.items()}
40 |     return {
41 |         "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id,
42 |         "thing_classes": thing_classes_,
43 |         "thing_clases_id_to_part_id": thing_clases_id_to_part_id,
44 |         "part_classes": sorted(set(part_classes)),
45 |         "thing_clases_id_to_whole_id": thing_clases_id_to_whole_id,
46 |         "whole_classes": sorted(set(whole_classes)),
47 |         "thing_clases_id_to_flattened_wholepart": thing_clases_id_to_flattened_wholepart,
48 |         "thing_classes_id_without_part": thing_classes_id_without_part,
49 |         }
50 | 
51 | 
52 | def register_paco_part_instances(name, metadata, json_file, image_root):
53 |     DatasetCatalog.register(name, lambda: load_coco_json(
54 |         json_file, image_root, name))
55 |     MetadataCatalog.get(name).set(
56 |         json_file=json_file, image_root=image_root,
57 |         evaluator_type="pascal_part_interactive", **metadata
58 |     )
59 | 
60 | _PACO = {
61 |     "paco_train": ("coco", "paco/annotations/paco_lvis_v1_train.json"),
62 |     # "pascal_part_train_one": ("pascal_part/VOCdevkit/VOC2010/JPEGImages", "pascal_part/train_one.json"),
63 |     "paco_val_inter": ("coco", "paco/annotations/paco_lvis_v1_val_mini.json"),
64 |     # "paco_test": ("paco/val2017", "paco/annotations/paco_lvis_v1_val.json"),
65 |     # "pascal_part_base_train": ("pascal_part/VOCdevkit/VOC2010/JPEGImages", "pascal_part/train_base.json"),
66 |     # "pascal_part_base_train_one": ("pascal_part/VOCdevkit/VOC2010/JPEGImages", "pascal_part/train_base_one.json"),
67 |     # "imagenet_voc_parsed": ("imagenet/train", "imagenet/imagenet_voc_image_parsed.json"),
68 |     # "imagenet_golden_pascal_parsed": ("imagenet/train", "imagenet/imagenet_golden_pascal_parsed.json"),
69 |     # "imagenet_golden_pascal_parsed_swinbase": ("imagenet/train", "imagenet/imagenet_golden_pascal_parsed_swinbase.json"),
70 | }
71 | 
72 | 
73 | def register_paco_part(root):
74 |     for key, (image_root, json_file) in _PACO.items():
75 |         register_paco_part_instances(
76 |             key,
77 |             _get_paco_metadata(key),
78 |             os.path.join(root, json_file) if "://" not in json_file else json_file,
79 |             os.path.join(root, image_root),
80 |         )
81 | 
82 | _root = os.getenv("PACO", "datasets")
83 | if _root!="datasets":
84 |     with open(os.path.join(_root,"paco/annotations/paco_lvis_v1_val.json")) as f:
85 |         j=json.load(f)
86 |     PACO_CATEGORIES=j['categories']
87 | 
88 |     register_paco_part(_root)
89 | else:
90 |     print("skip paco register")


--------------------------------------------------------------------------------
/datasets/registration/register_partimagenet_part_all.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | import logging
  3 | import os
  4 | from detectron2.data import DatasetCatalog, MetadataCatalog
  5 | from detectron2.data.datasets.coco import load_coco_json
  6 | 
  7 | PART_IN_CATEGORIES = [{'id': 0, 'name': 'Quadruped Head', 'supercategory': 'Quadruped'},
  8 |                       {'id': 1, 'name': 'Quadruped Body', 'supercategory': 'Quadruped'},
  9 |                       {'id': 2, 'name': 'Quadruped Foot', 'supercategory': 'Quadruped'},
 10 |                       {'id': 3, 'name': 'Quadruped Tail', 'supercategory': 'Quadruped'},
 11 |                       {'id': 4, 'name': 'Biped Head', 'supercategory': 'Biped'},
 12 |                       {'id': 5, 'name': 'Biped Body', 'supercategory': 'Biped'},
 13 |                       {'id': 6, 'name': 'Biped Hand', 'supercategory': 'Biped'},
 14 |                       {'id': 7, 'name': 'Biped Foot', 'supercategory': 'Biped'},
 15 |                       {'id': 8, 'name': 'Biped Tail', 'supercategory': 'Biped'},
 16 |                       {'id': 9, 'name': 'Fish Head', 'supercategory': 'Fish'},
 17 |                       {'id': 10, 'name': 'Fish Body', 'supercategory': 'Fish'},
 18 |                       {'id': 11, 'name': 'Fish Fin', 'supercategory': 'Fish'},
 19 |                       {'id': 12, 'name': 'Fish Tail', 'supercategory': 'Fish'},
 20 |                       {'id': 13, 'name': 'Bird Head', 'supercategory': 'Bird'},
 21 |                       {'id': 14, 'name': 'Bird Body', 'supercategory': 'Bird'},
 22 |                       {'id': 15, 'name': 'Bird Wing', 'supercategory': 'Bird'},
 23 |                       {'id': 16, 'name': 'Bird Foot', 'supercategory': 'Bird'},
 24 |                       {'id': 17, 'name': 'Bird Tail', 'supercategory': 'Bird'},
 25 |                       {'id': 18, 'name': 'Snake Head', 'supercategory': 'Snake'},
 26 |                       {'id': 19, 'name': 'Snake Body', 'supercategory': 'Snake'},
 27 |                       {'id': 20, 'name': 'Reptile Head', 'supercategory': 'Reptile'},
 28 |                       {'id': 21, 'name': 'Reptile Body', 'supercategory': 'Reptile'},
 29 |                       {'id': 22, 'name': 'Reptile Foot', 'supercategory': 'Reptile'},
 30 |                       {'id': 23, 'name': 'Reptile Tail', 'supercategory': 'Reptile'},
 31 |                       {'id': 24, 'name': 'Car Body', 'supercategory': 'Car'},
 32 |                       {'id': 25, 'name': 'Car Tier', 'supercategory': 'Car'},
 33 |                       {'id': 26, 'name': 'Car Side Mirror', 'supercategory': 'Car'},
 34 |                       {'id': 27, 'name': 'Bicycle Body', 'supercategory': 'Bicycle'},
 35 |                       {'id': 28, 'name': 'Bicycle Head', 'supercategory': 'Bicycle'},
 36 |                       {'id': 29, 'name': 'Bicycle Seat', 'supercategory': 'Bicycle'},
 37 |                       {'id': 30, 'name': 'Bicycle Tier', 'supercategory': 'Bicycle'},
 38 |                       {'id': 31, 'name': 'Boat Body', 'supercategory': 'Boat'},
 39 |                       {'id': 32, 'name': 'Boat Sail', 'supercategory': 'Boat'},
 40 |                       {'id': 33, 'name': 'Aeroplane Head', 'supercategory': 'Aeroplane'},
 41 |                       {'id': 34, 'name': 'Aeroplane Body', 'supercategory': 'Aeroplane'},
 42 |                       {'id': 35, 'name': 'Aeroplane Engine', 'supercategory': 'Aeroplane'},
 43 |                       {'id': 36, 'name': 'Aeroplane Wing', 'supercategory': 'Aeroplane'},
 44 |                       {'id': 37, 'name': 'Aeroplane Tail', 'supercategory': 'Aeroplane'},
 45 |                       {'id': 38, 'name': 'Bottle Mouth', 'supercategory': 'Bottle'},
 46 |                       {'id': 39, 'name': 'Bottle Body', 'supercategory': 'Bottle'}]
 47 | 
 48 | 
 49 | def _get_partimagenet_metadata(key):
 50 |     id_to_name = {x['id']: x['name'] for x in PART_IN_CATEGORIES}
 51 |     thing_dataset_id_to_contiguous_id = {
 52 |         x: i for i, x in enumerate(sorted(id_to_name))}
 53 |     thing_classes = [id_to_name[k] for k in sorted(id_to_name)]
 54 | 
 55 |     part_classes = [a.split(" ")[1].lower() for a in thing_classes]
 56 |     thing_clases_id_to_part_id = {v: sorted(set(part_classes)).index(n) for v, n in enumerate(part_classes)}
 57 |     whole_classes = [a.split(" ")[0].lower() for a in thing_classes]
 58 |     thing_clases_id_to_whole_id = {v: sorted(set(whole_classes)).index(n) for v, n in enumerate(whole_classes)}
 59 |     thing_clases_id_to_flattened_wholepart = {tid: thing_clases_id_to_whole_id[tid] * len(set(part_classes)) + pid for
 60 |                                               tid, pid in thing_clases_id_to_part_id.items()}
 61 |     return {
 62 |         "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id,
 63 |         "thing_classes": thing_classes,
 64 |         "thing_clases_id_to_part_id": thing_clases_id_to_part_id,
 65 |         "part_classes": sorted(set(part_classes)),
 66 |         "thing_clases_id_to_whole_id": thing_clases_id_to_whole_id,
 67 |         "whole_classes": sorted(set(whole_classes)),
 68 |         "thing_clases_id_to_flattened_wholepart": thing_clases_id_to_flattened_wholepart,
 69 |     }
 70 | 
 71 | 
 72 | def register_partimagenet_part_instances(name, metadata, json_file, image_root):
 73 |     DatasetCatalog.register(name, lambda: load_coco_json(
 74 |         json_file, image_root, name))
 75 |     MetadataCatalog.get(name).set(
 76 |         json_file=json_file, image_root=image_root,
 77 |         evaluator_type="pascal_part_interactive", **metadata
 78 |     )
 79 | 
 80 | 
 81 | _PART_IN = {
 82 |     "partimagenet_train": ("imagenet/train", "partimagenet/train_format.json"),
 83 |     "partimagenet_val_inter": ("imagenet/val", "partimagenet/val_format_mini.json"),
 84 | }
 85 | 
 86 | 
 87 | def register_partimagenet_part(root):
 88 |     for key, (image_root, json_file) in _PART_IN.items():
 89 |         register_partimagenet_part_instances(
 90 |             key,
 91 |             _get_partimagenet_metadata(key),
 92 |             os.path.join(root, json_file) if "://" not in json_file else json_file,
 93 |             os.path.join(root, image_root),
 94 |         )
 95 | 
 96 | 
 97 | _root = os.getenv("PART_IN", "datasets")
 98 | if _root!='datasets':
 99 |     register_partimagenet_part(_root)
100 | 


--------------------------------------------------------------------------------
/datasets/registration/register_sam.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2022 The IDEA Authors. All rights reserved.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | # ------------------------------------------------------------------------------------------------
 16 | # Copyright (c) Facebook, Inc. and its affiliates.
 17 | # ------------------------------------------------------------------------------------------------
 18 | # Modified from:
 19 | # https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/datasets/register_ade20k_instance.py
 20 | # ------------------------------------------------------------------------------------------------
 21 | 
 22 | import json
 23 | import logging
 24 | import numpy as np
 25 | import os
 26 | from PIL import Image
 27 | 
 28 | from detectron2.data import DatasetCatalog, MetadataCatalog
 29 | from detectron2.utils.file_io import PathManager
 30 | import detectron2.utils.comm as comm
 31 | import torch.distributed as dist
 32 | 
 33 | import os.path as op
 34 | 
 35 | SAM_CATEGORIES = [{'id': 1, 'name': 'stuff'}]
 36 | 
 37 | _PREDEFINED_SPLITS = {
 38 |     # point annotations without masks
 39 |     "sam_train": (
 40 |         "",
 41 |     ),
 42 |     "sam_val": (
 43 |         "",
 44 |     ),
 45 | }
 46 | 
 47 | 
 48 | def _get_sam_instances_meta():
 49 |     thing_ids = [k["id"] for k in SAM_CATEGORIES]
 50 |     assert len(thing_ids) == 1, len(thing_ids)
 51 |     # Mapping from the incontiguous ADE category id to an id in [0, 99]
 52 |     thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)}
 53 |     thing_classes = [k["name"] for k in SAM_CATEGORIES]
 54 |     ret = {
 55 |         "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id,
 56 |         "thing_classes": thing_classes,
 57 |     }
 58 |     return ret
 59 | 
 60 | def load_sam_index(tsv_file, dataset_name=None, extra_annotation_keys=None):
 61 |     """
 62 |     Load a json file with COCO's instances annotation format.
 63 |     Currently supports instance detection, instance segmentation,
 64 |     and person keypoints annotations.
 65 |     """
 66 |     dataset_dicts = []
 67 |     tsv_id = 0
 68 |     files = os.listdir(tsv_file)
 69 |     start = int(os.getenv("SAM_SUBSET_START", "90"))
 70 |     end = int(os.getenv("SAM_SUBSET_END", "100"))
 71 |     if len(files)>0 and 'part' in files[0]:  # for hgx
 72 |                 files = [f for f in files if '.tsv' in f and int(f.split('.')[1].split('_')[-1])>=start and int(f.split('.')[1].split('_')[-1])<end]
 73 |     else:  # for msr
 74 |         files = [f for f in files if '.tsv' in f and int(f.split('.')[0].split('-')[-1])>=start and int(f.split('.')[0].split('-')[-1])<end]
 75 |         
 76 |     for tsv in files:
 77 |         if op.splitext(tsv)[1] == '.tsv':
 78 |             print('register tsv to create index', "tsv_id", tsv_id, tsv)
 79 |             lineidx = os.path.join(tsv_file, op.splitext(tsv)[0] + '.lineidx')
 80 |             line_name = op.splitext(tsv)[0] + '.lineidx'
 81 |             
 82 |             with open(lineidx, 'r') as fp:
 83 |                 lines = fp.readlines()
 84 |                 _lineidx = [int(i.strip().split()[0]) for i in lines]
 85 | 
 86 |             dataset_dict =[{'idx': (tsv_id, i)} for i in range(len(_lineidx))]
 87 |             dataset_dicts = dataset_dicts + dataset_dict
 88 |             tsv_id += 1
 89 |     return dataset_dicts
 90 | 
 91 | def register_sam_instances(name, metadata, tsv_file):
 92 |     assert isinstance(name, str), name
 93 | 
 94 |     DatasetCatalog.register(name, lambda: load_sam_index(tsv_file, name))
 95 | 
 96 |     # 2. Optionally, add metadata about this dataset,
 97 |     # since they might be useful in evaluation, visualization or logging
 98 |     MetadataCatalog.get(name).set(
 99 |         tsv_file=tsv_file, evaluator_type="sam_interactive",  **metadata
100 |     )
101 | 
102 | 
103 | def register_all_sam_instance(root):
104 |     for key, tsv_file in _PREDEFINED_SPLITS.items():
105 |         # Assume pre-defined datasets live in `./datasets`.
106 |         register_sam_instances(
107 |             key,
108 |             _get_sam_instances_meta(),
109 |             os.path.join(root, tsv_file[0]),
110 |         )
111 | 
112 | _root = os.getenv("SAM_DATASETS", "datasets")
113 | # _root_local = os.getenv("SAM_LOCAL", "no")
114 | # if _root_local != 'no' or 'comp_robot' in _root:
115 | #     if 'comp_robot' not in _root:
116 | #         _root = _root_local
117 | if _root != 'no':
118 |     print('registering sam datasets from', _root)
119 |     register_all_sam_instance(_root)
120 | 


--------------------------------------------------------------------------------
/datasets/registration/register_sam_json.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Facebook, Inc. and its affiliates.
 2 | import json
 3 | import os
 4 | import collections
 5 | import glob
 6 | import torch
 7 | 
 8 | from detectron2.data import DatasetCatalog, MetadataCatalog
 9 | from detectron2.data.datasets import load_sem_seg
10 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
11 | from detectron2.utils.file_io import PathManager
12 | 
13 | 
14 | _PREDEFINED_SPLITS_SAM_RAW = {
15 |     "sam_train": (
16 |         "meta_sa",
17 |         (901,910)
18 |     ),
19 |     "sam_minitrain": (
20 |         "meta_sa",
21 |         (0,12)
22 |     ),
23 |     "sam_val": (
24 |         "meta_sa",
25 |         (901,902)
26 |     ),
27 |     "sam_minival": (
28 |         "meta_sa",
29 |         (998,999)
30 |     ),    
31 | }
32 | 
33 | 
34 | def load_sam_instances(name: str, dirname: str, id_range: tuple):
35 |     """
36 |     Load SAM detection annotations to Detectron2 format.
37 | 
38 |     Args:
39 |         name: name of split
40 |         dirname: dataset directory path
41 |         id_range: (start, end) tuple for dataset subfolders
42 |     """
43 |     dicts = []
44 |     for id in range(*id_range):
45 |         subfolder = os.path.join(dirname, 'sa_%06d' % id, 'image_list.da')
46 |         dicts += torch.load(subfolder)
47 |     return dicts
48 | 
49 | def register_sam(name, dirname, id_range):
50 |     DatasetCatalog.register("{}".format(name), lambda: load_sam_instances(name, dirname, id_range))
51 |     MetadataCatalog.get("{}".format(name)).set(
52 |         dirname=dirname,
53 |         thing_dataset_id_to_contiguous_id={},
54 |     )
55 | 
56 | def register_all_sam(root):
57 |     for (
58 |         prefix,
59 |         (image_root, id_range),
60 |     ) in _PREDEFINED_SPLITS_SAM_RAW.items():
61 |         register_sam(
62 |             prefix,
63 |             os.path.join(root, image_root),
64 |             id_range
65 |         )
66 | 
67 | _root = os.getenv("SAM_JSON", "datasets")
68 | if _root!='datasets':
69 |     register_all_sam(_root)


--------------------------------------------------------------------------------
/datasets/registration/register_scannet_semseg.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Facebook, Inc. and its affiliates.
 2 | # --------------------------------------------------------
 3 | # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
 4 | # Copyright (c) 2022 Microsoft
 5 | # Licensed under The MIT License [see LICENSE for details]
 6 | # Modified by Xueyan Zou (xueyan@cs.wisc.edu)
 7 | # --------------------------------------------------------
 8 | import numpy as np
 9 | import os
10 | import glob
11 | from typing import List, Tuple, Union
12 | 
13 | from detectron2.data import DatasetCatalog, MetadataCatalog
14 | from detectron2.structures import BoxMode
15 | from detectron2.utils.file_io import PathManager
16 | 
17 | from utils.constants import SCAN_37, SCAN_40, SCAN_20
18 | 
19 | __all__ = ["load_scannet_instances", "register_scannet_context"]
20 | 
21 | name2folder = {"scannet_41_val_seg": "label41",
22 |                "scannet_38_val_seg": "label38",
23 |                "scannet_21_val_seg": "label21",}
24 | 
25 | name2class = {"scannet_41_val_seg": SCAN_40,
26 |               "scannet_38_val_seg": SCAN_37,
27 |               "scannet_21_val_seg": SCAN_20}
28 | 
29 | 
30 | def load_scannet_instances(name: str, dirname: str, split: str, class_names: Union[List[str], Tuple[str, ...]]):
31 |     """
32 |     Load ScanNet annotations to Detectron2 format.
33 | 
34 |     Args:
35 |         dirname: Contain "Annotations", "ImageSets", "JPEGImages"
36 |         split (str): one of "train", "test", "val", "trainval"
37 |         class_names: list or tuple of class names
38 |     """
39 |     with PathManager.open(os.path.join(dirname, "meta", split + ".txt")) as f:
40 |         fileids = np.loadtxt(f, dtype=np.str)
41 |         
42 |     dicts = []
43 |     for field in fileids:
44 |         image_dir = os.path.join(dirname, 'images', field[0])
45 |         semseg_dir = image_dir.replace('color', name2folder[name]).replace('jpg', 'png')
46 |         r = {
47 |             "file_name": image_dir,
48 |             "sem_seg_file_name": semseg_dir,
49 |             "image_id": semseg_dir.split('/')[-3] + semseg_dir.split('/')[-1].split('.')[0],
50 |         }
51 |         dicts.append(r)
52 |     return dicts
53 | 
54 | 
55 | def register_scannet_context(name, dirname, split, class_names=name2class):
56 |     DatasetCatalog.register(name, lambda: load_scannet_instances(name, dirname, split, class_names))
57 |     MetadataCatalog.get(name).set(
58 |         stuff_classes=class_names[name],
59 |         dirname=dirname,
60 |         split=split,
61 |         ignore_label=[0],
62 |         thing_dataset_id_to_contiguous_id={},
63 |         class_offset=1,
64 |         keep_sem_bgd=False
65 |     )
66 | 
67 | 
68 | def register_all_sunrgbd_seg(root):
69 |     SPLITS = [
70 |             ("scannet_41_val_seg", "scannet_frames_25k", "val"),
71 |             ("scannet_38_val_seg", "scannet_frames_25k", "val"),
72 |             ("scannet_21_val_seg", "scannet_frames_25k", "val"),
73 |         ]
74 |         
75 |     for name, dirname, split in SPLITS:
76 |         register_scannet_context(name, os.path.join(root, dirname), split)
77 |         MetadataCatalog.get(name).evaluator_type = "sem_seg"
78 | 
79 | 
80 | _root = os.getenv("DATSETW", "datasets")
81 | if _root!='datasets':
82 |     register_all_sunrgbd_seg(_root)


--------------------------------------------------------------------------------
/datasets/registration/register_seginw_instance.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | import json
  3 | import os
  4 | import collections
  5 | 
  6 | from detectron2.data import DatasetCatalog, MetadataCatalog
  7 | from detectron2.data.datasets import load_sem_seg
  8 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
  9 | from detectron2.utils.file_io import PathManager
 10 | 
 11 | _CATEGORIES = ['Elephants', 'Hand-Metal', 'Watermelon', 'House-Parts', 'HouseHold-Items', 'Strawberry', 'Fruits',
 12 |                'Nutterfly-Squireel',
 13 |                'Hand', 'Garbage', 'Chicken', 'Rail', 'Airplane-Parts', 'Brain-Tumor', 'Poles', 'Electric-Shaver',
 14 |                'Bottles',
 15 |                'Toolkits', 'Trash', 'Salmon-Fillet', 'Puppies', 'Tablets', 'Phones', 'Cows', 'Ginger-Garlic']
 16 | 
 17 | _PREDEFINED_SPLITS_SEGINW = {
 18 |     "seginw_{}_val".format(cat): (
 19 |         "valid",
 20 |         "seginw/{}".format(cat),  # image_root
 21 |         "_annotations_min1cat.coco.json",  # annot_root
 22 |     ) for cat in _CATEGORIES
 23 | 
 24 | }
 25 | _PREDEFINED_SPLITS_SEGINW.update({
 26 |     "seginw_{}_train".format(cat): (
 27 |         "train",
 28 |         "seginw/{}".format(cat),  # image_root
 29 |         "_annotations_min1cat.coco.json",  # annot_root
 30 |     ) for cat in _CATEGORIES
 31 | })
 32 | 
 33 | 
 34 | def get_metadata():
 35 |     meta = {}
 36 |     return meta
 37 | 
 38 | 
 39 | def load_seginw_json(name, image_root, annot_json, metadata):
 40 |     """
 41 |     Args:
 42 |         image_dir (str): path to the raw dataset. e.g., "~/coco/train2017".
 43 |         gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017".
 44 |         json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json".
 45 |     Returns:
 46 |         list[dict]: a list of dicts in Detectron2 standard format. (See
 47 |         `Using Custom Datasets </tutorials/datasets.html>`_ )
 48 |     """
 49 | 
 50 |     with PathManager.open(annot_json) as f:
 51 |         json_info = json.load(f)
 52 | 
 53 |     # build dictionary for grounding
 54 |     grd_dict = collections.defaultdict(list)
 55 |     for grd_ann in json_info['annotations']:
 56 |         image_id = int(grd_ann["image_id"])
 57 |         grd_dict[image_id].append(grd_ann)
 58 | 
 59 |     ret = []
 60 |     for image in json_info["images"]:
 61 |         image_id = int(image["id"])
 62 |         image_file = os.path.join(image_root, image['file_name'])
 63 |         grounding_anno = grd_dict[image_id]
 64 | 
 65 |         if 'train' in name and len(grounding_anno) == 0:
 66 |             continue
 67 | 
 68 |         ret.append(
 69 |             {
 70 |                 "file_name": image_file,
 71 |                 "image_id": image_id,
 72 |                 "inst_info": grounding_anno,
 73 |             }
 74 |         )
 75 | 
 76 |     assert len(ret), f"No images found in {image_root}!"
 77 |     assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"]
 78 |     return ret
 79 | 
 80 | 
 81 | def register_seginw(
 82 |         name, metadata, image_root, annot_json):
 83 |     DatasetCatalog.register(
 84 |         name,
 85 |         lambda: load_seginw_json(name, image_root, annot_json, metadata),
 86 |     )
 87 |     MetadataCatalog.get(name).set(
 88 |         image_root=image_root,
 89 |         json_file=annot_json,
 90 |         evaluator_type="seginw",
 91 |         ignore_label=255,
 92 |         label_divisor=1000,
 93 |         **metadata,
 94 |     )
 95 | 
 96 | 
 97 | def register_all_seginw(root):
 98 |     for (
 99 |             prefix,
100 |             (split, folder_name, annot_name),
101 |     ) in _PREDEFINED_SPLITS_SEGINW.items():
102 |         register_seginw(
103 |             prefix,
104 |             get_metadata(),
105 |             os.path.join(root, folder_name, split),
106 |             os.path.join(root, folder_name, split, annot_name),
107 |         )
108 | 
109 | 
110 | _root = os.getenv("DATSETW", "datasets")
111 | if _root!='datasets':
112 |     register_all_seginw(_root)
113 | 


--------------------------------------------------------------------------------
/datasets/registration/register_sunrgbd_semseg.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # Copyright (c) Facebook, Inc. and its affiliates.
 3 | # --------------------------------------------------------
 4 | # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
 5 | # Copyright (c) 2022 Microsoft
 6 | # Licensed under The MIT License [see LICENSE for details]
 7 | # Modified by Xueyan Zou (xueyan@cs.wisc.edu)
 8 | # --------------------------------------------------------
 9 | import numpy as np
10 | import os
11 | import glob
12 | from typing import List, Tuple, Union
13 | 
14 | from detectron2.data import DatasetCatalog, MetadataCatalog
15 | from detectron2.structures import BoxMode
16 | from detectron2.utils.file_io import PathManager
17 | 
18 | from utils.constants import SUN_RGBD_37
19 | 
20 | __all__ = ["load_sunrgbd_instances", "register_sunrgbd_context"]
21 | 
22 | def load_sunrgbd_instances(name: str, dirname: str, split: str, class_names: Union[List[str], Tuple[str, ...]]):
23 |     """
24 |     Load SUN-RGBD detection annotations to Detectron2 format.
25 | 
26 |     Args:
27 |         dirname: Contain "Annotations", "ImageSets", "JPEGImages"
28 |         split (str): one of "train", "test", "val", "trainval"
29 |         class_names: list or tuple of class names
30 |     """
31 |     if split == 'val':
32 |         split = 'test'
33 |         
34 |     # Needs to read many small annotation files. Makes sense at local
35 |     image_pths = sorted(glob.glob(os.path.join(dirname, 'image', split, '*.jpg')))
36 |     semseg_pths = sorted(glob.glob(os.path.join(dirname, 'label37', split, '*.png')))
37 |     
38 |     assert len(image_pths) == len(semseg_pths)
39 |     # 5k images
40 |     dicts = []
41 |     for image_dir, semseg_dir in zip(image_pths, semseg_pths):
42 |         r = {
43 |             "file_name": image_dir,
44 |             "sem_seg_file_name": semseg_dir,
45 |             "image_id": semseg_dir.split('/')[-1].split('.')[0],
46 |         }
47 |         dicts.append(r)
48 |     return dicts
49 | 
50 | 
51 | def register_sun_context(name, dirname, split, class_names=SUN_RGBD_37):
52 |     DatasetCatalog.register(name, lambda: load_sunrgbd_instances(name, dirname, split, class_names))
53 |     MetadataCatalog.get(name).set(
54 |         stuff_classes=class_names,
55 |         dirname=dirname,
56 |         split=split,
57 |         ignore_label=[0],
58 |         thing_dataset_id_to_contiguous_id={},
59 |         class_offset=1,
60 |         keep_sem_bgd=False
61 |     )
62 | 
63 | 
64 | def register_all_sunrgbd_seg(root):
65 |     SPLITS = [
66 |             ("sunrgbd_37_val_seg", "sun_rgbd", "val"),
67 |         ]
68 |         
69 |     for name, dirname, split in SPLITS:
70 |         register_sun_context(name, os.path.join(root, dirname), split)
71 |         MetadataCatalog.get(name).evaluator_type = "sem_seg"
72 | 
73 | 
74 | _root = os.getenv("DATSETW", "datasets")
75 | if _root!='datasets':
76 |     register_all_sunrgbd_seg(_root)


--------------------------------------------------------------------------------
/datasets/registration/register_ytvos_dataset.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # Copyright (c) Facebook, Inc. and its affiliates.
 3 | import os
 4 | import glob
 5 | import json
 6 | from typing import List, Tuple, Union
 7 | 
 8 | import cv2
 9 | import numpy as np
10 | from scipy.io import loadmat
11 | 
12 | from detectron2.data import DatasetCatalog, MetadataCatalog
13 | from detectron2.structures import BoxMode
14 | from detectron2.utils.file_io import PathManager
15 | 
16 | 
17 | __all__ = ["load_ytovs_instances", "register_ytvos_context"]
18 | 
19 | def load_ytvos_instances(name: str, dirname: str, split: str):
20 |     """
21 |     Load Pascal VOC detection annotations to Detectron2 format.
22 | 
23 |     Args:
24 |         dirname: Contain "Annotations", "ImageSets", "JPEGImages"
25 |         split (str): one of "train", "test", "val", "trainval"
26 |         class_names: list or tuple of class names
27 |     """
28 |     meta_json = os.path.join(dirname, split, "meta.json")
29 |     video_dir = os.path.join(dirname, split, 'JPEGImages')
30 |     mask_dir = os.path.join(dirname, split, 'Annotations')
31 |     video_names = os.listdir(video_dir)
32 |     meta = json.load(open(meta_json))['videos']
33 | 
34 |     dicts = []
35 |     for vid_name in video_names:
36 |         objects = meta[vid_name]['objects']
37 |         r = {
38 |             "file_name": os.path.join(video_dir, vid_name),
39 |             "mask_name": os.path.join(mask_dir, vid_name),
40 |             "objects": objects,
41 |         }
42 |         dicts.append(r)
43 | 
44 |     return dicts
45 | 
46 | def register_ytvos_context(name, dirname, split):
47 |     if not os.path.exists(dirname):
48 |         print("not register for ", name)
49 |         return -1
50 |     DatasetCatalog.register("{}".format(name), lambda: load_ytvos_instances(name, dirname, split))
51 |     MetadataCatalog.get("{}".format(name)).set(
52 |         dirname=dirname,
53 |         thing_dataset_id_to_contiguous_id={},
54 |     )
55 | 
56 | def register_all_davis(root):
57 |     SPLITS = [
58 |             ("ytvos19_val", "ytvos2019", "valid"),
59 |             ("ytvos18_val", "ytvos2018", "valid"),
60 |         ]
61 | 
62 |     for name, dirname, split in SPLITS:
63 |         register_ytvos_context(name, os.path.join(root, dirname), split)
64 |         MetadataCatalog.get("{}".format(name)).evaluator_type = None
65 | 
66 | _root = os.getenv("TRACKING_DATASET", "datasets")
67 | if _root!='datasets':
68 |     register_all_davis(_root)


--------------------------------------------------------------------------------
/datasets/semseg_loader.py:
--------------------------------------------------------------------------------
 1 | from PIL import Image
 2 | import scipy.io
 3 | import numpy as np
 4 | 
 5 | def load_semseg(filename, loader_type):
 6 |     if loader_type == 'PIL':
 7 |         semseg = np.array(Image.open(filename), dtype=np.int)
 8 |     elif loader_type == 'MAT':
 9 |         semseg = scipy.io.loadmat(filename)['LabelMap']
10 |     return semseg


--------------------------------------------------------------------------------
/datasets/shapes/__init__.py:
--------------------------------------------------------------------------------
 1 | from .sampler import ShapeSampler
 2 | from .simpleclick_sampler import SimpleClickSampler
 3 | 
 4 | 
 5 | def build_shape_sampler(cfg, **kwargs):
 6 |     sampler_name = cfg['STROKE_SAMPLER']['EVAL']['MODE']
 7 |     if sampler_name == 'random':
 8 |         return ShapeSampler(cfg, **kwargs)
 9 |     elif sampler_name == 'best':
10 |         return SimpleClickSampler(cfg, **kwargs)
11 |     else:
12 |         assert False, "not implemented"


--------------------------------------------------------------------------------
/datasets/shapes/sampler.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import random
 3 | 
 4 | import torch
 5 | import torch.nn as nn
 6 | 
 7 | from dinov.utils import configurable
 8 | from .scribble import Scribble
 9 | 
10 | 
11 | class ShapeSampler(nn.Module):
12 |     @configurable
13 |     def __init__(self, max_candidate=1, shape_prob=[], shape_candidate=[], is_train=True):
14 |         super().__init__()
15 |         self.max_candidate = max_candidate
16 |         self.shape_prob = shape_prob
17 |         self.shape_candidate = shape_candidate
18 |         self.is_train = is_train
19 | 
20 |     @classmethod
21 |     def from_config(cls, cfg, is_train=True, mode=None):
22 |         max_candidate = cfg['STROKE_SAMPLER']['MAX_CANDIDATE']
23 |         candidate_probs = cfg['STROKE_SAMPLER']['CANDIDATE_PROBS']
24 |         candidate_names = cfg['STROKE_SAMPLER']['CANDIDATE_NAMES']
25 |         candidate_classes = [getattr(sys.modules[__name__], class_name)(cfg, is_train) for class_name in candidate_names]
26 | 
27 |         # overwrite condidate_prob
28 |         if not is_train:
29 |             candidate_probs = [0.0 for x in range(len(candidate_names))]
30 |             candidate_probs[candidate_names.index(mode)] = 1.0
31 | 
32 |         # Build augmentation
33 |         return {
34 |             "max_candidate": max_candidate,
35 |             "shape_prob": candidate_probs,
36 |             "shape_candidate": candidate_classes,
37 |             "is_train": is_train,
38 |         }
39 | 
40 |     def forward(self, masks, boxes, max_candidate=50):
41 |         # masks = instances.gt_masks.tensor
42 |         # boxes = instances.gt_boxes.tensor
43 | 
44 |         if len(masks) == 0:
45 |             gt_masks = torch.zeros(masks.shape[-2:]).bool()
46 |             rand_masks = torch.zeros(masks.shape[-2:]).bool()
47 |             return {'gt_masks': gt_masks[None,:], 'rand_shape': torch.stack([rand_masks]), 'types': ['none']}
48 |         indices = [x for x in range(len(masks))]
49 |  
50 |         if self.is_train:
51 |             # random.shuffle(indices)
52 |             candidate_mask = masks[indices[:max_candidate]]
53 |             # candidate_box = boxes[indices[:max_candidate]]
54 |         else:
55 |             candidate_mask = masks
56 |         candidate_box = boxes
57 |         
58 |         draw_funcs = random.choices(self.shape_candidate, weights=self.shape_prob, k=len(candidate_mask))   # sample one shape, i.e., point
59 |         rand_shapes = [d.draw(x, b).cuda() for d,x, b in zip(draw_funcs, candidate_mask, candidate_box)]
60 |         types = [repr(x) for x in draw_funcs]
61 |         for i in range(0, len(rand_shapes)):
62 |             if rand_shapes[i].sum() == 0:
63 |                 candidate_mask[i] = candidate_mask[i] * 0
64 |                 types[i] = 'none'
65 | 
66 |         # candidate_mask: (c,h,w), bool. rand_shape: (c, iter, h, w), bool. types: list(c)
67 |         try:
68 |             rand_shapess=torch.stack(rand_shapes)
69 |         except RuntimeError:
70 |             for r in rand_shapes:
71 |                 print('r ', r.device())
72 |             print(candidate_mask.device())
73 |         return {'gt_masks': candidate_mask, 'rand_shape': torch.stack(rand_shapes), 'types': types, 'sampler': self}
74 | 
75 | def build_shape_sampler(cfg, **kwargs):
76 |     return ShapeSampler(cfg, **kwargs)


--------------------------------------------------------------------------------
/datasets/shapes/scribble.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | import torch
 4 | 
 5 | from .mask_generators import get_mask_by_input_strokes
 6 | 
 7 | 
 8 | class Scribble:
 9 |     def __init__(self, cfg, is_train):
10 |         self.num_stroke = cfg['STROKE_SAMPLER']['SCRIBBLE']['NUM_STROKES']
11 |         self.stroke_preset = cfg['STROKE_SAMPLER']['SCRIBBLE']['STROKE_PRESET']
12 |         self.stroke_prob = cfg['STROKE_SAMPLER']['SCRIBBLE']['STROKE_PROB']
13 |         self.eval_stroke = cfg['STROKE_SAMPLER']['EVAL']['MAX_ITER']
14 |         self.is_train = is_train
15 | 
16 |     @staticmethod
17 |     def get_stroke_preset(stroke_preset):
18 |         if stroke_preset == 'rand_curve':
19 |             return {
20 |                 "nVertexBound": [20, 50],
21 |                 "maxHeadSpeed": 30,
22 |                 "maxHeadAcceleration": (30, 0.5),
23 |                 "brushWidthBound": (3, 15),
24 |                 "nMovePointRatio": 0.5,
25 |                 "maxPiontMove": 6,
26 |                 "maxLineAcceleration": (9, 0.5),
27 |                 "boarderGap": None,
28 |                 "maxInitSpeed": 10
29 |             }
30 |         elif stroke_preset == 'rand_curve_small':
31 |             return {
32 |                 "nVertexBound": [6, 22],
33 |                 "maxHeadSpeed": 12,
34 |                 "maxHeadAcceleration": (8, 0.5),
35 |                 "brushWidthBound": (2.5, 5),
36 |                 "nMovePointRatio": 0.5,
37 |                 "maxPiontMove": 1.5,
38 |                 "maxLineAcceleration": (3, 0.5),
39 |                 "boarderGap": None,
40 |                 "maxInitSpeed": 3
41 |             }
42 |         else:
43 |             raise NotImplementedError(f'The stroke presetting "{stroke_preset}" does not exist.')
44 | 
45 |     def get_random_points_from_mask(self, mask, n=5):
46 |         h,w = mask.shape
47 |         view_mask = mask.reshape(h*w)
48 |         non_zero_idx = view_mask.nonzero()[:,0]
49 |         selected_idx = torch.randperm(len(non_zero_idx))[:n]
50 |         non_zero_idx = non_zero_idx[selected_idx]
51 |         y = (non_zero_idx // w)*1.0
52 |         x = (non_zero_idx % w)*1.0
53 |         return torch.cat((x[:,None], y[:,None]), dim=1).cpu().numpy()
54 | 
55 |     def draw(self, mask=None, box=None):
56 |         if mask.sum() < 1:
57 |             return torch.zeros(mask.shape).bool().cuda() # if mask is empty
58 |         if not self.is_train:
59 |             return self.draw_eval(mask=mask, box=box)
60 |         stroke_preset_name = random.choices(self.stroke_preset, weights=self.stroke_prob, k=1)[0]
61 |         preset = Scribble.get_stroke_preset(stroke_preset_name)
62 |         nStroke = random.randint(1, min(self.num_stroke, mask.sum().item()))
63 |         h,w = mask.shape
64 |         points = self.get_random_points_from_mask(mask, n=nStroke)
65 |         rand_mask = get_mask_by_input_strokes(
66 |             init_points=points,
67 |             imageWidth=w, imageHeight=h, nStroke=min(nStroke, len(points)), **preset)
68 |         rand_mask = (~torch.from_numpy(rand_mask)).cuda() * mask
69 |         return rand_mask
70 | 
71 |     def draw_eval(self, mask=None, box=None):
72 |         stroke_preset_name = random.choices(self.stroke_preset, weights=self.stroke_prob, k=1)[0]
73 |         preset = Scribble.get_stroke_preset(stroke_preset_name)
74 |         nStroke = min(self.eval_stroke, mask.sum().item())
75 |         h,w = mask.shape
76 |         points = self.get_random_points_from_mask(mask, n=nStroke)
77 |         rand_masks = []
78 |         for i in range(len(points)):
79 |             rand_mask = get_mask_by_input_strokes(
80 |                 init_points=points[:i+1],
81 |                 imageWidth=w, imageHeight=h, nStroke=min(i, len(points)), **preset)
82 |             rand_mask = (~torch.from_numpy(rand_mask)).cuda() * mask
83 |             rand_masks += [rand_mask]
84 |         return torch.stack(rand_masks)
85 | 
86 |     @staticmethod
87 |     def draw_by_points(points, mask, h, w):
88 |         preset = Scribble.get_stroke_preset('rand_curve_small')
89 |         rand_mask = get_mask_by_input_strokes(
90 |             init_points=points,
91 |             imageWidth=w, imageHeight=h, nStroke=len(points), **preset)[None,]
92 |         rand_masks = (~torch.from_numpy(rand_mask)) * mask
93 |         return rand_masks
94 | 
95 |     def __repr__(self,):
96 |         return 'scribble'


--------------------------------------------------------------------------------
/datasets/shapes/simpleclick_sampler.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import random
 3 | 
 4 | import cv2
 5 | import numpy as np
 6 | from scipy import ndimage
 7 | import torch
 8 | import torch.nn as nn
 9 | import torch.nn.functional as F
10 | from kornia.contrib import distance_transform
11 | 
12 | from .scribble import Scribble
13 | from dinov.utils import configurable
14 | 
15 | 
16 | class SimpleClickSampler(nn.Module):
17 |     @configurable
18 |     def __init__(self, mask_mode='point', sample_negtive=False, is_train=True, dilation=None, dilation_kernel=None):
19 |         super().__init__()
20 |         self.mask_mode = mask_mode
21 |         self.sample_negtive = sample_negtive
22 |         self.is_train = is_train
23 |         self.dilation = dilation
24 |         self.register_buffer("dilation_kernel", dilation_kernel)
25 | 
26 |     @classmethod
27 |     def from_config(cls, cfg, is_train=True, mode=None):
28 |         mask_mode = mode
29 |         sample_negtive = cfg['STROKE_SAMPLER']['EVAL']['NEGATIVE']
30 | 
31 |         dilation = cfg['STROKE_SAMPLER']['DILATION']
32 |         dilation_kernel = torch.ones((1, 1, dilation, dilation), device=torch.cuda.current_device())
33 | 
34 |         # Build augmentation
35 |         return {
36 |             "mask_mode": mask_mode,
37 |             "sample_negtive": sample_negtive,
38 |             "is_train": is_train,
39 |             "dilation": dilation,
40 |             "dilation_kernel": dilation_kernel,
41 |         }
42 | 
43 |     def forward_scribble(self, instances, pred_masks=None, prev_masks=None):
44 |         gt_masks_batch = instances.gt_masks
45 |         _,h,w = gt_masks_batch.shape
46 | 
47 |         rand_shapes = []
48 |         for i in range(len(gt_masks_batch)):
49 |             gt_masks = gt_masks_batch[i:i+1]
50 |             assert len(gt_masks) == 1 # it only supports a single image, with a single candidate mask.
51 |             # pred_masks is after padding
52 | 
53 |             # We only consider positive points
54 |             pred_masks = torch.zeros(gt_masks.shape).bool() if pred_masks is None else pred_masks[:,:h,:w]
55 |             prev_masks = torch.zeros(gt_masks.shape).bool() if prev_masks is None else prev_masks
56 | 
57 |             fp = gt_masks & (~(gt_masks & pred_masks)) & (~prev_masks)
58 |             next_mask = torch.zeros(gt_masks.shape).bool()
59 | 
60 |             mask_dt = torch.from_numpy(cv2.distanceTransform(fp[0].numpy().astype(np.uint8), cv2.DIST_L2, 0)[None,:])
61 |             max_value = mask_dt.max()
62 |             next_mask[(mask_dt==max_value).nonzero()[0:1].t().tolist()] = True
63 | 
64 |             points = next_mask[0].nonzero().flip(dims=[-1])
65 |             next_mask = Scribble.draw_by_points(points, gt_masks, h, w)
66 |             rand_shapes += [(prev_masks | next_mask)]
67 | 
68 |         types = ['scribble' for i in range(len(gt_masks_batch))]
69 |         return {'gt_masks': instances.gt_masks, 'rand_shape': rand_shapes, 'types': types, 'sampler': self}
70 | 
71 |     def forward(self, instances, *args, **kwargs):
72 |         if self.mask_mode == 'Point':
73 |             return self.forward_point(instances, *args, **kwargs)
74 |         elif self.mask_mode == 'Circle':
75 |             assert False, "Circle not support best path."
76 |         elif self.mask_mode == 'Scribble':
77 |             assert False, "Scribble not support best path."
78 |         elif self.mask_mode == 'Polygon':
79 |             assert False, "Polygon not support best path."
80 | 


--------------------------------------------------------------------------------
/datasets/utils/tsv/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Author: Yihao Chen
 3 | # @Date:   2021-08-16 16:56:22
 4 | # @Last Modified by:   Yihao Chen
 5 | # @Last Modified time: 2021-08-16 17:00:28
 6 | 
 7 | from .io_common import FileProgressingbar, img_from_base64, generate_lineidx
 8 | from .tsv_io import TSVFile
 9 | 
10 | __all__ = [
11 |     'FileProgressingbar', 'img_from_base64', 'generate_lineidx', 'TSVFile'
12 | ]


--------------------------------------------------------------------------------
/datasets/utils/tsv/io_common.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Author: Yihao Chen
 3 | # @Date:   2021-08-13 14:35:27
 4 | # @Last Modified by:   Yihao Chen
 5 | # @Last Modified time: 2022-04-24 11:38:58
 6 | 
 7 | import os
 8 | import base64
 9 | from io import BytesIO
10 | from PIL import Image
11 | 
12 | import cv2
13 | import yaml
14 | import progressbar
15 | import numpy as np
16 | import torchvision.transforms as T
17 | 
18 | class FileProgressingbar:
19 |     fileobj = None
20 |     pbar = None
21 |     def __init__(self, fileobj, msg):
22 |         fileobj.seek(0, os.SEEK_END)
23 |         flen = fileobj.tell()
24 |         fileobj.seek(0, os.SEEK_SET)
25 |         self.fileobj = fileobj
26 |         widgets = [msg, progressbar.AnimatedMarker(), ' ', progressbar.Percentage(), ' ', progressbar.Bar(), ' ', progressbar.ETA()]
27 |         self.pbar = progressbar.ProgressBar(widgets=widgets, maxval=flen).start()
28 | 
29 |     def update(self):
30 |         self.pbar.update(self.fileobj.tell())
31 | 
32 | 
33 | def img_from_base64(imagestring):
34 |     jpgbytestring = base64.b64decode(imagestring)
35 |     image = BytesIO(jpgbytestring)
36 |     image = Image.open(image).convert("RGB")
37 |     return image
38 | 
39 |     # jpgbytestring = base64.b64decode(imagestring)
40 |     # nparr = np.frombuffer(jpgbytestring, np.uint8)
41 |     # try:
42 |     #     r = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
43 |     #     # r = cv2.cvtColor(r, cv2.COLOR_BGR2RGB)
44 |     #     return r
45 |     # except:
46 |     #     return None
47 | 
48 | 
49 | def generate_lineidx(filein, idxout):
50 |     assert not os.path.isfile(idxout)
51 |     with open(filein, 'r') as tsvin, open(idxout, 'w') as tsvout:
52 |         bar = FileProgressingbar(tsvin, 'Generating lineidx {0}: '.format(idxout))
53 |         fsize = os.fstat(tsvin.fileno()).st_size
54 |         fpos = 0
55 |         while fpos != fsize:
56 |             tsvout.write(str(fpos)+"\n")
57 |             tsvin.readline()
58 |             fpos = tsvin.tell()
59 |             bar.update()
60 | 


--------------------------------------------------------------------------------
/datasets/utils/tsv/tsv_io.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Author: Yihao Chen
 3 | # @Date:   2021-08-13 14:26:21
 4 | # @Last Modified by:   Yihao Chen
 5 | # @Last Modified time: 2022-08-17 00:57:51
 6 | import time
 7 | import os
 8 | import os.path as op
 9 | from .io_common import generate_lineidx, FileProgressingbar
10 | 
11 | 
12 | class TSVFile(object):
13 |     def __init__(self, tsv_file, silence=True):
14 |         self.tsv_file = tsv_file
15 |         self.lineidx = op.splitext(tsv_file)[0] + '.lineidx'
16 | 
17 |         self.label_file = op.splitext(tsv_file)[0] + '.label'
18 |         self.label_lineidx = op.splitext(tsv_file)[0] + '.label.lineidx'
19 | 
20 |         if os.path.exists(self.label_file):
21 |             self.split_label = True
22 |         else:
23 |             self.split_label = False
24 | 
25 |         self._fp = None
26 |         self._lineidx = None
27 | 
28 |         self._label_fp = None
29 |         self._label_lineidx = None
30 | 
31 |         self.pid = None
32 |         self.silence = silence
33 |         self._ensure_lineidx_loaded()
34 | 
35 |     def num_rows(self):
36 |         return len(self._lineidx)
37 | 
38 |     def seek(self, idx):
39 |         self._ensure_tsv_opened()
40 |         pos = self._lineidx[idx]
41 |         self._fp.seek(pos)
42 |         tsv_info = [s.strip() for s in self._fp.readline().split('\t')]    
43 | 
44 |         if self.split_label:
45 |             label_pos = self._label_lineidx[idx]
46 |             self._label_fp.seek(label_pos)
47 |             label_info = [s.strip() for s in self._label_fp.readline().split('\t')]
48 | 
49 |             assert tsv_info[0] == label_info[0]
50 |             tsv_info = [tsv_info[0], label_info[-1], tsv_info[-1]]
51 | 
52 |         return tsv_info
53 | 
54 |     def close(self):
55 |         if self._fp is not None:
56 |             self._fp.close()
57 |             del self._fp
58 |             del self._lineidx
59 |             
60 |             self._fp = None
61 |             self._lineidx = None
62 | 
63 |     def _ensure_lineidx_loaded(self):
64 |         if not op.isfile(self.lineidx) and not op.islink(self.lineidx):
65 |             generate_lineidx(self.tsv_file, self.lineidx)
66 | 
67 |         if self._lineidx is None:
68 |             with open(self.lineidx, 'r') as fp:
69 |                 lines = fp.readlines()
70 |                 self._lineidx = [int(i.strip().split()[0]) for i in lines]
71 | 
72 |             if self.split_label:
73 |                 with open(self.label_lineidx, 'r') as fp:
74 |                     lines = fp.readlines()
75 |                     self._label_lineidx = [int(i.strip().split()[0]) for i in lines]                
76 | 
77 | 
78 |     def _ensure_tsv_opened(self):
79 |         self._ensure_lineidx_loaded()
80 |         if self._fp is None:
81 |             self._fp = open(self.tsv_file, 'r')
82 |             self.pid = os.getpid()
83 | 
84 |             if self.split_label:
85 |                 self._label_fp = open(self.label_file, 'r')
86 | 
87 |         if self.pid != os.getpid():
88 |             print('re-open {} because the process id changed'.format(self.tsv_file))
89 |             self._fp = open(self.tsv_file, 'r')
90 |             self.pid = os.getpid()
91 | 
92 |             if self.split_label:
93 |                 self._label_fp = open(self.label_file, 'r')
94 | 


--------------------------------------------------------------------------------
/demo/__init__.py:
--------------------------------------------------------------------------------
1 | from .openset_task import task_openset


--------------------------------------------------------------------------------
/demo/examples/bags.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UX-Decoder/DINOv/53bf20d5cfdbb86fa35141a1cff432d4923599f2/demo/examples/bags.jpg


--------------------------------------------------------------------------------
/demo/examples/corgi2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UX-Decoder/DINOv/53bf20d5cfdbb86fa35141a1cff432d4923599f2/demo/examples/corgi2.jpg


--------------------------------------------------------------------------------
/demo/examples/img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UX-Decoder/DINOv/53bf20d5cfdbb86fa35141a1cff432d4923599f2/demo/examples/img.png


--------------------------------------------------------------------------------
/demo/examples/ref_cat.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UX-Decoder/DINOv/53bf20d5cfdbb86fa35141a1cff432d4923599f2/demo/examples/ref_cat.jpeg


--------------------------------------------------------------------------------
/demo/openset_task.py:
--------------------------------------------------------------------------------
  1 | # --------------------------------------------------------
  2 | # Semantic-SAM: Segment and Recognize Anything at Any Granularity
  3 | # Copyright (c) 2023 Microsoft
  4 | # Licensed under The MIT License [see LICENSE for details]
  5 | # Written by Hao Zhang (hzhangcx@connect.ust.hk)
  6 | # --------------------------------------------------------
  7 | # Copyright (c) 2024 Microsoft
  8 | # Licensed under The MIT License [see LICENSE for details]
  9 | # Written by Feng Li (fliay@connect.ust.hk)
 10 | # --------------------------------------------------------
 11 | 
 12 | import torch
 13 | import numpy as np
 14 | from torchvision import transforms
 15 | from utils.visualizer import Visualizer
 16 | from typing import Tuple
 17 | from PIL import Image
 18 | from detectron2.data import MetadataCatalog
 19 | import os
 20 | import cv2
 21 | 
 22 | metadata = MetadataCatalog.get('coco_2017_train_panoptic')
 23 | 
 24 | 
 25 | def inverse_sigmoid(x, eps=1e-5):
 26 |     x = x.clamp(min=0, max=1)
 27 |     x1 = x.clamp(min=eps)
 28 |     x2 = (1 - x).clamp(min=eps)
 29 |     return torch.log(x1/x2)
 30 | 
 31 | def task_openset(model,generic_vp1, generic_vp2, generic_vp3, generic_vp4,
 32 |                    generic_vp5, generic_vp6, generic_vp7, generic_vp8, image_tgt=None, text_size=640,hole_scale=100,island_scale=100):
 33 |     in_context_examples = [generic_vp1, generic_vp2, generic_vp3, generic_vp4,
 34 |                    generic_vp5, generic_vp6, generic_vp7, generic_vp8]
 35 |     in_context_examples = [x for x in in_context_examples if x is not None]
 36 |     t = []
 37 |     t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC))
 38 |     def prepare_image(image_ori):
 39 |         width = image_ori.size[0]
 40 |         height = image_ori.size[1]
 41 |         image_ori = np.asarray(image_ori)        
 42 |         images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
 43 |         return images, height, width
 44 |     transform1 = transforms.Compose(t)
 45 |     image_ori_tgt = transform1(image_tgt)
 46 |     images_tgt, height_tgt, width_tgt = prepare_image(image_ori_tgt)
 47 |     data_tgt = {"image": images_tgt, "height": height_tgt, "width": width_tgt}
 48 |     batched_inputs = []
 49 |     batched_inputs_tgt = [data_tgt]
 50 |     multi_scale_features2, mask_features2, _, _ = model.model.get_encoder_feature(batched_inputs_tgt)
 51 |     input_query_label_content_all = []
 52 |     point_coords = torch.ones(1, 4).cuda().float()
 53 |     point_coords[:, :2] = 0.
 54 |     input_query_bbox_content_init = inverse_sigmoid(point_coords[None])
 55 |     for image in in_context_examples:
 56 |         image_ori = transform1(image['image'])
 57 |         mask_ori = transform1(image['mask'])
 58 |         images, height, width = prepare_image(image_ori)
 59 |         
 60 |         data = {"image": images, "height": height, "width": width}
 61 |         data['seg_image'] = data_tgt
 62 | 
 63 |         mask_ori = np.asarray(mask_ori)[:,:,0:1].copy()
 64 |         mask_ori = torch.from_numpy(mask_ori).permute(2,0,1)
 65 | 
 66 |         data['targets'] = [dict()]
 67 |         data['targets'][0]['rand_shape']=mask_ori
 68 |         data['targets'][0]['pb']=torch.tensor([1.])    # FIXME 0 or 1
 69 | 
 70 |         frame = data
 71 |         rand_shape = mask_ori
 72 |         frame['targets'][0]['rand_shape'] = rand_shape
 73 |         
 74 |         batched_inputs.append(frame)
 75 |         
 76 |         multi_scale_features, _, padded_h, padded_w = model.model.get_encoder_feature([frame])
 77 |         input_query_label_content, input_query_bbox_content, attn_mask_content = model.model. \
 78 |             get_visual_prompt_content_feature(multi_scale_features, frame['targets'][0]['rand_shape'], padded_h, padded_w)
 79 |         input_query_label_content_all.append(input_query_label_content)
 80 |         
 81 |     # prompt to tgt image
 82 |     input_query_label_content_current = torch.stack(input_query_label_content_all).mean(0)
 83 |     masks, ious, ori_masks, scores_per_image_openset = model.model.evaluate_demo_content_openset_multi_with_content_features(
 84 |         batched_inputs_tgt, mask_features2, multi_scale_features2, input_query_label_content_current,
 85 |         input_query_bbox_content_init, attn_mask_content, padded_h, padded_w)
 86 |     if len(ious.shape)>1:
 87 |         ious=ious[0]
 88 |     ids=torch.argsort(scores_per_image_openset,descending=True)
 89 |     areas=[]
 90 |     image_ori = image_ori_tgt
 91 |     new_pred_mask = []
 92 |     new_pred_class_score = []
 93 |     for i in ids:
 94 |         new_pred_class_score.append(scores_per_image_openset[i])
 95 |         new_pred_mask.append(masks[i])
 96 |     pred_masks_poses = new_pred_mask
 97 |     ious = new_pred_class_score
 98 |     visual = Visualizer(image_ori, metadata=metadata)
 99 |     for i,(pred_masks_pos,iou, _, _) in enumerate(zip(pred_masks_poses,ious, pred_masks_poses, pred_masks_poses)):
100 |         iou=round(float(iou),2)
101 |         texts=f'{iou}'
102 |         mask=(pred_masks_pos>0.0).cpu().numpy()
103 |         area=mask.sum()
104 |         areas.append(area)
105 |         # uncomment for additional postprocessing
106 |         # mask,_=remove_small_regions(mask,int(hole_scale),mode="holes")
107 |         # mask,_=remove_small_regions(mask,int(island_scale),mode="islands")
108 |         mask=(mask).astype(np.float)
109 |         color=[0.,0.,1.0]
110 |         color=[0.502, 0.0, 0.502]
111 |         demo = visual.draw_binary_mask(mask, text='', alpha=0.7, edge_color=color)
112 |     res = demo.get_image()
113 | 
114 |     torch.cuda.empty_cache()
115 | 
116 |     return res
117 | 
118 | def remove_small_regions(
119 |     mask: np.ndarray, area_thresh: float, mode: str
120 | ) -> Tuple[np.ndarray, bool]:
121 |     """
122 |     Removes small disconnected regions and holes in a mask. Returns the
123 |     mask and an indicator of if the mask has been modified.
124 |     """
125 |     import cv2  # type: ignore
126 | 
127 |     assert mode in ["holes", "islands"]
128 |     correct_holes = mode == "holes"
129 |     working_mask = (correct_holes ^ mask).astype(np.uint8)
130 |     n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
131 |     sizes = stats[:, -1][1:]  # Row 0 is background label
132 |     small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
133 |     if len(small_regions) == 0:
134 |         return mask, False
135 |     fill_labels = [0] + small_regions
136 |     if not correct_holes:
137 |         fill_labels = [i for i in range(n_labels) if i not in fill_labels]
138 |         # If every region is below threshold, keep largest
139 |         if len(fill_labels) == 0:
140 |             fill_labels = [int(np.argmax(sizes)) + 1]
141 |     mask = np.isin(regions, fill_labels)
142 |     return mask, True


--------------------------------------------------------------------------------
/demo_openset.py:
--------------------------------------------------------------------------------
  1 | # --------------------------------------------------------
  2 | # Semantic-SAM: Segment and Recognize Anything at Any Granularity
  3 | # Copyright (c) 2023 Microsoft
  4 | # Licensed under The MIT License [see LICENSE for details]
  5 | # Written by Hao Zhang (hzhangcx@connect.ust.hk)
  6 | # --------------------------------------------------------
  7 | # Copyright (c) 2024 Microsoft
  8 | # Licensed under The MIT License [see LICENSE for details]
  9 | # Written by Feng Li (fliay@connect.ust.hk)
 10 | # --------------------------------------------------------
 11 | 
 12 | 
 13 | import gradio as gr
 14 | import torch
 15 | import argparse
 16 | 
 17 | from dinov.BaseModel import BaseModel
 18 | from dinov import build_model
 19 | from utils.arguments import load_opt_from_config_file
 20 | 
 21 | from demo import task_openset
 22 | 
 23 | def parse_option():
 24 |     parser = argparse.ArgumentParser('DINOv Demo', add_help=False)
 25 |     parser.add_argument('--conf_files', default="configs/dinov_sam_coco_swinl_train.yaml", metavar="FILE", help='path to config file', )
 26 |     parser.add_argument('--ckpt', default="", metavar="FILE", help='path to ckpt', required=True)
 27 |     parser.add_argument('--port', default=6099, type=int, help='path to ckpt', )
 28 |     args = parser.parse_args()
 29 | 
 30 |     return args
 31 | 
 32 | 
 33 | class ImageMask(gr.components.Image):
 34 |     """
 35 |     Sets: source="canvas", tool="sketch"
 36 |     """
 37 | 
 38 |     is_template = True
 39 | 
 40 |     def __init__(self, **kwargs):
 41 |         super().__init__(source="upload", tool="sketch", interactive=True, **kwargs)
 42 | 
 43 |     def preprocess(self, x):
 44 |         return super().preprocess(x)
 45 | 
 46 | 
 47 | '''
 48 | build args
 49 | '''
 50 | args = parse_option()
 51 | 
 52 | '''
 53 | build model
 54 | '''
 55 | 
 56 | sam_cfg=args.conf_files
 57 | 
 58 | opt = load_opt_from_config_file(sam_cfg)
 59 | 
 60 | model_sam = BaseModel(opt, build_model(opt)).from_pretrained(args.ckpt).eval().cuda()
 61 | 
 62 | @torch.no_grad()
 63 | def inference(generic_vp1, generic_vp2, generic_vp3, generic_vp4,
 64 |                    generic_vp5, generic_vp6, generic_vp7, generic_vp8, image2,*args, **kwargs):
 65 |     with torch.autocast(device_type='cuda', dtype=torch.float16):
 66 |         model=model_sam
 67 |         a= task_openset(model, generic_vp1, generic_vp2, generic_vp3, generic_vp4,
 68 |                    generic_vp5, generic_vp6, generic_vp7, generic_vp8, image2, *args, **kwargs)
 69 |         return a
 70 | 
 71 | 
 72 | '''
 73 | launch app
 74 | '''
 75 | title = "DINOv: Visual In-Context Prompting"
 76 | 
 77 | article = "The Demo is Run on DINOv."
 78 | 
 79 | demo = gr.Blocks()
 80 | image_tgt=gr.components.Image(label="Target Image ",type="pil",brush_radius=15.0)
 81 | gallery_output=gr.components.Image(label="Results Image ",type="pil",brush_radius=15.0)
 82 | 
 83 | generic_vp1 = ImageMask(label="scribble on refer Image 1",type="pil",brush_radius=15.0)
 84 | generic_vp2 = ImageMask(label="scribble on refer Image 2",type="pil",brush_radius=15.0)
 85 | generic_vp3 = ImageMask(label="scribble on refer Image 3",type="pil",brush_radius=15.0)
 86 | generic_vp4 = ImageMask(label="scribble on refer Image 5",type="pil",brush_radius=15.0)
 87 | generic_vp5 = ImageMask(label="scribble on refer Image 6",type="pil",brush_radius=15.0)
 88 | generic_vp6 = ImageMask(label="scribble on refer Image 7",type="pil",brush_radius=15.0)
 89 | generic_vp7 = ImageMask(label="scribble on refer Image 8",type="pil",brush_radius=15.0)
 90 | generic_vp8 = ImageMask(label="scribble on refer Image 9",type="pil",brush_radius=15.0)
 91 | generic = gr.TabbedInterface([
 92 |                         generic_vp1, generic_vp2, generic_vp3, generic_vp4,
 93 |                         generic_vp5, generic_vp6, generic_vp7, generic_vp8
 94 |                     ], ["1", "2", "3", "4", "5", "6", "7", "8"])
 95 | 
 96 | title='''
 97 | # DINOv: Visual In-Context Prompting
 98 | 
 99 | # [[Read our arXiv Paper](https://arxiv.org/pdf/2311.13601.pdf)\] &nbsp; \[[Github page](https://github.com/UX-Decoder/DINOv)\] 
100 | '''
101 | 
102 | with demo:
103 |     with gr.Row():
104 |         with gr.Column(scale=3.0):
105 |             generation_tittle = gr.Markdown(title)
106 |             image_tgt.render()
107 |             generic.render()
108 |             with gr.Row(scale=2.0):
109 |                 clearBtn = gr.ClearButton(
110 |                     components=[image_tgt])
111 |                 runBtn = gr.Button("Run")
112 |         with gr.Column(scale=5.0):
113 | 
114 |             gallery_tittle = gr.Markdown("# Open-set results.")
115 |             with gr.Row(scale=9.0):
116 |                 gallery_output.render()
117 | 
118 |             example = gr.Examples(
119 |                 examples=[
120 |                     ["demo/examples/bags.jpg"],
121 |                     ["demo/examples/img.png"],
122 |                     ["demo/examples/corgi2.jpg"],
123 |                     ["demo/examples/ref_cat.jpeg"],
124 |                 ],
125 |                 inputs=image_tgt,
126 |                 cache_examples=False,
127 |             )
128 | 
129 |     title = title,
130 |     article = article,
131 |     allow_flagging = 'never',
132 | 
133 |     runBtn.click(inference, inputs=[generic_vp1, generic_vp2, generic_vp3, generic_vp4,
134 |                    generic_vp5, generic_vp6, generic_vp7, generic_vp8, image_tgt],
135 |               outputs = [gallery_output])
136 | 
137 | 
138 | 
139 | demo.queue().launch(share=True,server_port=args.port)
140 | 
141 | 


--------------------------------------------------------------------------------
/dinov/BaseModel.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import logging
 3 | 
 4 | import torch
 5 | import torch.nn as nn
 6 | 
 7 | from utils.model import align_and_update_state_dicts
 8 | 
 9 | logger = logging.getLogger(__name__)
10 | 
11 | 
12 | class BaseModel(nn.Module):
13 |     def __init__(self, opt, module: nn.Module):
14 |         super(BaseModel, self).__init__()
15 |         self.opt = opt
16 |         self.model = module
17 | 
18 |     def forward(self, *inputs, **kwargs):
19 |         outputs = self.model(*inputs, **kwargs)
20 |         return outputs
21 | 
22 |     def from_pretrained(self, load_dir):
23 |         state_dict = torch.load(load_dir, map_location='cpu')
24 |         if 'model' in state_dict:
25 |             state_dict=state_dict['model']
26 |         state_dict={k[6:]:v for k,v in state_dict.items() if k.startswith('model.')}
27 |         state_dict = align_and_update_state_dicts(self.model.state_dict(), state_dict)
28 |         self.model.load_state_dict(state_dict, strict=False)
29 |         return self


--------------------------------------------------------------------------------
/dinov/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 | 
5 | from .architectures import build_model
6 | from utils.dist import get_world_size, all_gather


--------------------------------------------------------------------------------
/dinov/architectures/__init__.py:
--------------------------------------------------------------------------------
1 | from .dinov import *
2 | from .build import build_model
3 | 
4 | 


--------------------------------------------------------------------------------
/dinov/architectures/build.py:
--------------------------------------------------------------------------------
 1 | from .registry import model_entrypoints
 2 | from .registry import is_model
 3 | 
 4 | 
 5 | def build_model(config, **kwargs):
 6 |     model_name = config['MODEL']['NAME']
 7 | 
 8 |     if not is_model(model_name):
 9 |         raise ValueError(f'Unkown model: {model_name}')
10 | 
11 |     return model_entrypoints(model_name)(config, **kwargs)


--------------------------------------------------------------------------------
/dinov/architectures/registry.py:
--------------------------------------------------------------------------------
 1 | _model_entrypoints = {}
 2 | 
 3 | def register_model(fn):
 4 |     module_name_split = fn.__module__.split('.')
 5 |     model_name = module_name_split[-1]
 6 |     _model_entrypoints[model_name] = fn
 7 |     return fn
 8 | 
 9 | def model_entrypoints(model_name):
10 |     return _model_entrypoints[model_name]
11 | 
12 | def is_model(model_name):
13 |     return model_name in _model_entrypoints


--------------------------------------------------------------------------------
/dinov/backbone/__init__.py:
--------------------------------------------------------------------------------
1 | from .build import build_backbone
2 | 
3 | from .focal import *
4 | from .focal_dw import *
5 | from .swin import *
6 | from .backbone import *


--------------------------------------------------------------------------------
/dinov/backbone/backbone.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Facebook, Inc. and its affiliates.
 2 | import torch.nn as nn
 3 | 
 4 | from detectron2.modeling import ShapeSpec
 5 | 
 6 | __all__ = ["Backbone"]
 7 | 
 8 | 
 9 | class Backbone(nn.Module):
10 |     """
11 |     Abstract base class for network backbones.
12 |     """
13 | 
14 |     def __init__(self):
15 |         """
16 |         The `__init__` method of any subclass can specify its own set of arguments.
17 |         """
18 |         super().__init__()
19 | 
20 |     def forward(self):
21 |         """
22 |         Subclasses must override this method, but adhere to the same return type.
23 | 
24 |         Returns:
25 |             dict[str->Tensor]: mapping from feature name (e.g., "res2") to tensor
26 |         """
27 |         pass
28 | 
29 |     @property
30 |     def size_divisibility(self) -> int:
31 |         """
32 |         Some backbones require the input height and width to be divisible by a
33 |         specific integer. This is typically true for encoder / decoder type networks
34 |         with lateral connection (e.g., FPN) for which feature maps need to match
35 |         dimension in the "bottom up" and "top down" paths. Set to 0 if no specific
36 |         input size divisibility is required.
37 |         """
38 |         return 0
39 | 
40 |     def output_shape(self):
41 |         """
42 |         Returns:
43 |             dict[str->ShapeSpec]
44 |         """
45 |         # this is a backward-compatible default
46 |         return {
47 |             name: ShapeSpec(
48 |                 channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
49 |             )
50 |             for name in self._out_features
51 |         }
52 | 


--------------------------------------------------------------------------------
/dinov/backbone/build.py:
--------------------------------------------------------------------------------
 1 | from .registry import model_entrypoints
 2 | from .registry import is_model
 3 | 
 4 | from .backbone import *
 5 | 
 6 | def build_backbone(config, **kwargs):
 7 |     model_name = config['MODEL']['BACKBONE']['NAME']
 8 |     if not is_model(model_name):
 9 |         raise ValueError(f'Unkown model: {model_name}')
10 | 
11 |     return model_entrypoints(model_name)(config, **kwargs)


--------------------------------------------------------------------------------
/dinov/backbone/registry.py:
--------------------------------------------------------------------------------
 1 | _model_entrypoints = {}
 2 | 
 3 | 
 4 | def register_backbone(fn):
 5 |     module_name_split = fn.__module__.split('.')
 6 |     model_name = module_name_split[-1]
 7 |     _model_entrypoints[model_name] = fn
 8 |     return fn
 9 | 
10 | def model_entrypoints(model_name):
11 |     return _model_entrypoints[model_name]
12 | 
13 | def is_model(model_name):
14 |     return model_name in _model_entrypoints
15 | 


--------------------------------------------------------------------------------
/dinov/body/__init__.py:
--------------------------------------------------------------------------------
1 | from .build import build_openseed_head


--------------------------------------------------------------------------------
/dinov/body/build.py:
--------------------------------------------------------------------------------
 1 | from .registry import model_entrypoints
 2 | from .registry import is_model
 3 | from .general_head import *
 4 | 
 5 | 
 6 | def build_openseed_head(config, *args, **kwargs):
 7 |     model_name = config['MODEL']['HEAD']
 8 |     if not is_model(model_name):
 9 |         raise ValueError(f'Unkown model: {model_name}')
10 | 
11 |     body = model_entrypoints(model_name)(config, *args, **kwargs)
12 |     return body


--------------------------------------------------------------------------------
/dinov/body/decoder/__init__.py:
--------------------------------------------------------------------------------
1 | from .build import build_decoder
2 | from .dinov_openset_decoder import *
3 | 


--------------------------------------------------------------------------------
/dinov/body/decoder/build.py:
--------------------------------------------------------------------------------
 1 | from .registry import model_entrypoints
 2 | from .registry import is_model
 3 | 
 4 | 
 5 | def build_decoder(config, *args, **kwargs):
 6 |     model_name = config['MODEL']['DECODER']['NAME']
 7 | 
 8 |     if not is_model(model_name):
 9 |         raise ValueError(f'Unkown model: {model_name}')
10 | 
11 |     return model_entrypoints(model_name)(config, *args, **kwargs)


--------------------------------------------------------------------------------
/dinov/body/decoder/registry.py:
--------------------------------------------------------------------------------
 1 | _model_entrypoints = {}
 2 | 
 3 | def register_decoder(fn):
 4 |     module_name_split = fn.__module__.split('.')
 5 |     model_name = module_name_split[-1]
 6 |     _model_entrypoints[model_name] = fn
 7 |     return fn
 8 | 
 9 | def model_entrypoints(model_name):
10 |     return _model_entrypoints[model_name]
11 | 
12 | def is_model(model_name):
13 |     return model_name in _model_entrypoints


--------------------------------------------------------------------------------
/dinov/body/decoder/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .utils import *


--------------------------------------------------------------------------------
/dinov/body/decoder/utils/utils.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import copy
  3 | from torch import nn, Tensor
  4 | import os
  5 | 
  6 | import math
  7 | import torch.nn.functional as F
  8 | from torch import nn
  9 | 
 10 | 
 11 | class MLP(nn.Module):
 12 |     """ Very simple multi-layer perceptron (also called FFN)"""
 13 | 
 14 |     def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
 15 |         super().__init__()
 16 |         self.num_layers = num_layers
 17 |         h = [hidden_dim] * (num_layers - 1)
 18 |         self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
 19 | 
 20 |     def forward(self, x):
 21 |         for i, layer in enumerate(self.layers):
 22 |             x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
 23 |         return x
 24 | 
 25 | 
 26 | def inverse_sigmoid(x, eps=1e-5):
 27 |     x = x.clamp(min=0, max=1)
 28 |     x1 = x.clamp(min=eps)
 29 |     x2 = (1 - x).clamp(min=eps)
 30 |     return torch.log(x1/x2)
 31 | 
 32 | 
 33 | def gen_encoder_output_proposals(memory:Tensor, memory_padding_mask:Tensor, spatial_shapes:Tensor):
 34 |     """
 35 |     Input:
 36 |         - memory: bs, \sum{hw}, d_model
 37 |         - memory_padding_mask: bs, \sum{hw}
 38 |         - spatial_shapes: nlevel, 2
 39 |     Output:
 40 |         - output_memory: bs, \sum{hw}, d_model
 41 |         - output_proposals: bs, \sum{hw}, 4
 42 |     """
 43 |     N_, S_, C_ = memory.shape
 44 |     base_scale = 4.0
 45 |     proposals = []
 46 |     _cur = 0
 47 |     for lvl, (H_, W_) in enumerate(spatial_shapes):
 48 |         mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1)
 49 |         valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
 50 |         valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1)
 51 | 
 52 |         grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
 53 |                                         torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device))
 54 |         grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)
 55 | 
 56 |         scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2)
 57 |         grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale
 58 |         wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl)
 59 |         proposal = torch.cat((grid, wh), -1).view(N_, -1, 4)
 60 |         proposals.append(proposal)
 61 |         _cur += (H_ * W_)
 62 |     output_proposals = torch.cat(proposals, 1)
 63 |     output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
 64 |     output_proposals = torch.log(output_proposals / (1 - output_proposals))
 65 |     output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf'))
 66 |     output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf'))
 67 | 
 68 |     output_memory = memory
 69 |     output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0))
 70 |     output_memory = output_memory.masked_fill(~output_proposals_valid, float(0))
 71 |     return output_memory, output_proposals
 72 | 
 73 | 
 74 | def gen_sineembed_for_position(pos_tensor, dim=128):
 75 |     # n_query, bs, _ = pos_tensor.size()
 76 |     # sineembed_tensor = torch.zeros(n_query, bs, 256)
 77 |     scale = 2 * math.pi
 78 |     dim_t = torch.arange(dim, dtype=torch.float32, device=pos_tensor.device)
 79 |     dim_t = 10000 ** (2 * (dim_t // 2) / dim)
 80 |     x_embed = pos_tensor[:, :, 0] * scale
 81 |     y_embed = pos_tensor[:, :, 1] * scale
 82 |     pos_x = x_embed[:, :, None] / dim_t
 83 |     pos_y = y_embed[:, :, None] / dim_t
 84 |     pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2)
 85 |     pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2)
 86 |     if pos_tensor.size(-1) == 2:
 87 |         pos = torch.cat((pos_y, pos_x), dim=2)
 88 |     elif pos_tensor.size(-1) == 4:
 89 |         w_embed = pos_tensor[:, :, 2] * scale
 90 |         pos_w = w_embed[:, :, None] / dim_t
 91 |         pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2)
 92 | 
 93 |         h_embed = pos_tensor[:, :, 3] * scale
 94 |         pos_h = h_embed[:, :, None] / dim_t
 95 |         pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2)
 96 | 
 97 |         pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2)
 98 |     else:
 99 |         raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1)))
100 |     return pos
101 | 
102 | 
103 | def _get_activation_fn(activation):
104 |     """Return an activation function given a string"""
105 |     if activation == "relu":
106 |         return F.relu
107 |     if activation == "gelu":
108 |         return F.gelu
109 |     if activation == "glu":
110 |         return F.glu
111 |     if activation == "prelu":
112 |         return nn.PReLU()
113 |     if activation == "selu":
114 |         return F.selu
115 |     raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
116 | 
117 | 
118 | def _get_clones(module, N, layer_share=False):
119 | 
120 |     if layer_share:
121 |         return nn.ModuleList([module for i in range(N)])
122 |     else:
123 |         return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
124 | 
125 | def from_divisablity(x, div):
126 |     if x % div == 0:
127 |         return x
128 |     return (int(x / div) + 1) * div
129 | 
130 | def getIdx(a, id_start):
131 |     co = a.unsqueeze(0) - a.unsqueeze(1)
132 |     uniquer = co.unique(dim=0)
133 |     out = []
134 |     for r in uniquer:
135 |         cover = torch.arange(a.size(0)).to(a)
136 |         mask = r == 0
137 |         idx = cover[mask]
138 |         out.append(idx)
139 |     out = [o + id_start for o in out]
140 |     return {str(k.cpu().numpy()): v for k, v in zip(a.unique(), out[::-1])}
141 | 
142 | def get_world_size():
143 |     if torch.distributed.is_initialized():
144 |         return torch.distributed.get_world_size()
145 |     return 1
146 | 
147 | def all_gather(x):
148 |     if get_world_size() > 1:
149 |         all_x = [torch.zeros_like(x) for _ in range(get_world_size())]
150 |         torch.distributed.all_gather(all_x, x.detach())
151 |         all_x[torch.distributed.get_rank()] = x
152 |         x = torch.stack(all_x, dim=0)
153 |     return x
154 | 
155 | 
156 | def get_unpadded_tensor(tensors, num_examples):
157 |     new_tensor_list = []
158 |     for i, tensor in enumerate(tensors):
159 |         new_tensor_list.append(tensor[:num_examples[i]])
160 |     return new_tensor_list


--------------------------------------------------------------------------------
/dinov/body/encoder/__init__.py:
--------------------------------------------------------------------------------
1 | from .build import build_encoder


--------------------------------------------------------------------------------
/dinov/body/encoder/build.py:
--------------------------------------------------------------------------------
 1 | from .registry import model_entrypoints
 2 | from .registry import is_model
 3 | from .encoder_deform import *
 4 | 
 5 | def build_encoder(config, *args, **kwargs):
 6 |     model_name = config['MODEL']['ENCODER']['NAME']
 7 | 
 8 |     if not is_model(model_name):
 9 |         raise ValueError(f'Unkown model: {model_name}')
10 | 
11 |     return model_entrypoints(model_name)(config, *args, **kwargs)


--------------------------------------------------------------------------------
/dinov/body/encoder/ops/functions/__init__.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | from .ms_deform_attn_func import MSDeformAttnFunction
13 | 
14 | 


--------------------------------------------------------------------------------
/dinov/body/encoder/ops/functions/ms_deform_attn_func.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | from __future__ import absolute_import
13 | from __future__ import print_function
14 | from __future__ import division
15 | 
16 | import torch
17 | import torch.nn.functional as F
18 | from torch.autograd import Function
19 | from torch.autograd.function import once_differentiable
20 | 
21 | try:
22 |     import MultiScaleDeformableAttention as MSDA
23 | except ModuleNotFoundError as e:
24 |     info_string = (
25 |         "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n"
26 |         "\t`cd mask2former/modeling/pixel_decoder/ops`\n"
27 |         "\t`sh make.sh`\n"
28 |     )
29 |     raise ModuleNotFoundError(info_string)
30 | 
31 | 
32 | class MSDeformAttnFunction(Function):
33 |     @staticmethod
34 |     def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
35 |         ctx.im2col_step = im2col_step
36 |         output = MSDA.ms_deform_attn_forward(
37 |             value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
38 |         ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
39 |         return output
40 | 
41 |     @staticmethod
42 |     @once_differentiable
43 |     def backward(ctx, grad_output):
44 |         value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
45 |         grad_value, grad_sampling_loc, grad_attn_weight = \
46 |             MSDA.ms_deform_attn_backward(
47 |                 value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
48 | 
49 |         return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
50 | 
51 | 
52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
53 |     # for debug and test only,
54 |     # need to use cuda version instead
55 |     N_, S_, M_, D_ = value.shape
56 |     _, Lq_, M_, L_, P_, _ = sampling_locations.shape
57 |     value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
58 |     sampling_grids = 2 * sampling_locations - 1
59 |     sampling_value_list = []
60 |     for lid_, (H_, W_) in enumerate(value_spatial_shapes):
61 |         # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
62 |         value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)
63 |         # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
64 |         sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
65 |         # N_*M_, D_, Lq_, P_
66 |         sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
67 |                                           mode='bilinear', padding_mode='zeros', align_corners=False)
68 |         sampling_value_list.append(sampling_value_l_)
69 |     # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
70 |     attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
71 |     output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
72 |     return output.transpose(1, 2).contiguous()
73 | 


--------------------------------------------------------------------------------
/dinov/body/encoder/ops/modules/__init__.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | from .ms_deform_attn import MSDeformAttn
13 | 


--------------------------------------------------------------------------------
/dinov/body/encoder/ops/setup.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | import os
13 | import glob
14 | 
15 | import torch
16 | 
17 | from torch.utils.cpp_extension import CUDA_HOME
18 | from torch.utils.cpp_extension import CppExtension
19 | from torch.utils.cpp_extension import CUDAExtension
20 | 
21 | from setuptools import find_packages
22 | from setuptools import setup
23 | 
24 | requirements = ["torch", "torchvision"]
25 | 
26 | def get_extensions():
27 |     this_dir = os.path.dirname(os.path.abspath(__file__))
28 |     extensions_dir = os.path.join(this_dir, "src")
29 | 
30 |     main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
31 |     source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
32 |     source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
33 | 
34 |     sources = main_file + source_cpu
35 |     extension = CppExtension
36 |     extra_compile_args = {"cxx": []}
37 |     define_macros = []
38 | 
39 |     # Force cuda since torch ask for a device, not if cuda is in fact available.
40 |     if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None:
41 |         extension = CUDAExtension
42 |         sources += source_cuda
43 |         define_macros += [("WITH_CUDA", None)]
44 |         extra_compile_args["nvcc"] = [
45 |             "-DCUDA_HAS_FP16=1",
46 |             "-D__CUDA_NO_HALF_OPERATORS__",
47 |             "-D__CUDA_NO_HALF_CONVERSIONS__",
48 |             "-D__CUDA_NO_HALF2_OPERATORS__",
49 |         ]
50 |     else:
51 |         if CUDA_HOME is None:
52 |             raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.')
53 |         else:
54 |             raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().')
55 | 
56 |     sources = [os.path.join(extensions_dir, s) for s in sources]
57 |     include_dirs = [extensions_dir]
58 |     ext_modules = [
59 |         extension(
60 |             "MultiScaleDeformableAttention",
61 |             sources,
62 |             include_dirs=include_dirs,
63 |             define_macros=define_macros,
64 |             extra_compile_args=extra_compile_args,
65 |         )
66 |     ]
67 |     return ext_modules
68 | 
69 | setup(
70 |     name="MultiScaleDeformableAttention",
71 |     version="1.0",
72 |     author="Weijie Su",
73 |     url="https://github.com/fundamentalvision/Deformable-DETR",
74 |     description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
75 |     packages=find_packages(exclude=("configs", "tests",)),
76 |     ext_modules=get_extensions(),
77 |     cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
78 | )
79 | 


--------------------------------------------------------------------------------
/dinov/body/encoder/ops/src/cpu/ms_deform_attn_cpu.cpp:
--------------------------------------------------------------------------------
 1 | /*!
 2 | **************************************************************************************************
 3 | * Deformable DETR
 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | **************************************************************************************************
 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | **************************************************************************************************
 9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #include <vector>
17 | 
18 | #include <ATen/ATen.h>
19 | #include <ATen/cuda/CUDAContext.h>
20 | 
21 | 
22 | at::Tensor
23 | ms_deform_attn_cpu_forward(
24 |     const at::Tensor &value, 
25 |     const at::Tensor &spatial_shapes,
26 |     const at::Tensor &level_start_index,
27 |     const at::Tensor &sampling_loc,
28 |     const at::Tensor &attn_weight,
29 |     const int im2col_step)
30 | {
31 |     AT_ERROR("Not implement on cpu");
32 | }
33 | 
34 | std::vector<at::Tensor>
35 | ms_deform_attn_cpu_backward(
36 |     const at::Tensor &value, 
37 |     const at::Tensor &spatial_shapes,
38 |     const at::Tensor &level_start_index,
39 |     const at::Tensor &sampling_loc,
40 |     const at::Tensor &attn_weight,
41 |     const at::Tensor &grad_output,
42 |     const int im2col_step)
43 | {
44 |     AT_ERROR("Not implement on cpu");
45 | }
46 | 
47 | 


--------------------------------------------------------------------------------
/dinov/body/encoder/ops/src/cpu/ms_deform_attn_cpu.h:
--------------------------------------------------------------------------------
 1 | /*!
 2 | **************************************************************************************************
 3 | * Deformable DETR
 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | **************************************************************************************************
 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | **************************************************************************************************
 9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #pragma once
17 | #include <torch/extension.h>
18 | 
19 | at::Tensor
20 | ms_deform_attn_cpu_forward(
21 |     const at::Tensor &value, 
22 |     const at::Tensor &spatial_shapes,
23 |     const at::Tensor &level_start_index,
24 |     const at::Tensor &sampling_loc,
25 |     const at::Tensor &attn_weight,
26 |     const int im2col_step);
27 | 
28 | std::vector<at::Tensor>
29 | ms_deform_attn_cpu_backward(
30 |     const at::Tensor &value, 
31 |     const at::Tensor &spatial_shapes,
32 |     const at::Tensor &level_start_index,
33 |     const at::Tensor &sampling_loc,
34 |     const at::Tensor &attn_weight,
35 |     const at::Tensor &grad_output,
36 |     const int im2col_step);
37 | 
38 | 
39 | 


--------------------------------------------------------------------------------
/dinov/body/encoder/ops/src/cuda/ms_deform_attn_cuda.h:
--------------------------------------------------------------------------------
 1 | /*!
 2 | **************************************************************************************************
 3 | * Deformable DETR
 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | **************************************************************************************************
 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | **************************************************************************************************
 9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #pragma once
17 | #include <torch/extension.h>
18 | 
19 | at::Tensor ms_deform_attn_cuda_forward(
20 |     const at::Tensor &value, 
21 |     const at::Tensor &spatial_shapes,
22 |     const at::Tensor &level_start_index,
23 |     const at::Tensor &sampling_loc,
24 |     const at::Tensor &attn_weight,
25 |     const int im2col_step);
26 | 
27 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
28 |     const at::Tensor &value, 
29 |     const at::Tensor &spatial_shapes,
30 |     const at::Tensor &level_start_index,
31 |     const at::Tensor &sampling_loc,
32 |     const at::Tensor &attn_weight,
33 |     const at::Tensor &grad_output,
34 |     const int im2col_step);
35 | 
36 | 


--------------------------------------------------------------------------------
/dinov/body/encoder/ops/src/ms_deform_attn.h:
--------------------------------------------------------------------------------
 1 | /*!
 2 | **************************************************************************************************
 3 | * Deformable DETR
 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | **************************************************************************************************
 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | **************************************************************************************************
 9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #pragma once
17 | 
18 | #include "cpu/ms_deform_attn_cpu.h"
19 | 
20 | #ifdef WITH_CUDA
21 | #include "cuda/ms_deform_attn_cuda.h"
22 | #endif
23 | 
24 | 
25 | at::Tensor
26 | ms_deform_attn_forward(
27 |     const at::Tensor &value, 
28 |     const at::Tensor &spatial_shapes,
29 |     const at::Tensor &level_start_index,
30 |     const at::Tensor &sampling_loc,
31 |     const at::Tensor &attn_weight,
32 |     const int im2col_step)
33 | {
34 |     if (value.type().is_cuda())
35 |     {
36 | #ifdef WITH_CUDA
37 |         return ms_deform_attn_cuda_forward(
38 |             value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
39 | #else
40 |         AT_ERROR("Not compiled with GPU support");
41 | #endif
42 |     }
43 |     AT_ERROR("Not implemented on the CPU");
44 | }
45 | 
46 | std::vector<at::Tensor>
47 | ms_deform_attn_backward(
48 |     const at::Tensor &value, 
49 |     const at::Tensor &spatial_shapes,
50 |     const at::Tensor &level_start_index,
51 |     const at::Tensor &sampling_loc,
52 |     const at::Tensor &attn_weight,
53 |     const at::Tensor &grad_output,
54 |     const int im2col_step)
55 | {
56 |     if (value.type().is_cuda())
57 |     {
58 | #ifdef WITH_CUDA
59 |         return ms_deform_attn_cuda_backward(
60 |             value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
61 | #else
62 |         AT_ERROR("Not compiled with GPU support");
63 | #endif
64 |     }
65 |     AT_ERROR("Not implemented on the CPU");
66 | }
67 | 
68 | 


--------------------------------------------------------------------------------
/dinov/body/encoder/ops/src/vision.cpp:
--------------------------------------------------------------------------------
 1 | /*!
 2 | **************************************************************************************************
 3 | * Deformable DETR
 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 6 | **************************************************************************************************
 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 8 | **************************************************************************************************
 9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #include "ms_deform_attn.h"
17 | 
18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
19 |   m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
20 |   m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
21 | }
22 | 


--------------------------------------------------------------------------------
/dinov/body/encoder/ops/test.py:
--------------------------------------------------------------------------------
 1 | # ------------------------------------------------------------------------------------------------
 2 | # Deformable DETR
 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
 5 | # ------------------------------------------------------------------------------------------------
 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 7 | # ------------------------------------------------------------------------------------------------
 8 | 
 9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 | 
12 | from __future__ import absolute_import
13 | from __future__ import print_function
14 | from __future__ import division
15 | 
16 | import time
17 | import torch
18 | import torch.nn as nn
19 | from torch.autograd import gradcheck
20 | 
21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
22 | 
23 | 
24 | N, M, D = 1, 2, 2
25 | Lq, L, P = 2, 2, 2
26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()
27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))
28 | S = sum([(H*W).item() for H, W in shapes])
29 | 
30 | 
31 | torch.manual_seed(3)
32 | 
33 | 
34 | @torch.no_grad()
35 | def check_forward_equal_with_pytorch_double():
36 |     value = torch.rand(N, S, M, D).cuda() * 0.01
37 |     sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
38 |     attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
39 |     attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
40 |     im2col_step = 2
41 |     output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu()
42 |     output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu()
43 |     fwdok = torch.allclose(output_cuda, output_pytorch)
44 |     max_abs_err = (output_cuda - output_pytorch).abs().max()
45 |     max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
46 | 
47 |     print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
48 | 
49 | 
50 | @torch.no_grad()
51 | def check_forward_equal_with_pytorch_float():
52 |     value = torch.rand(N, S, M, D).cuda() * 0.01
53 |     sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
54 |     attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
55 |     attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
56 |     im2col_step = 2
57 |     output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu()
58 |     output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu()
59 |     fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
60 |     max_abs_err = (output_cuda - output_pytorch).abs().max()
61 |     max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
62 | 
63 |     print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
64 | 
65 | 
66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True):
67 | 
68 |     value = torch.rand(N, S, M, channels).cuda() * 0.01
69 |     sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
70 |     attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
71 |     attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
72 |     im2col_step = 2
73 |     func = MSDeformAttnFunction.apply
74 | 
75 |     value.requires_grad = grad_value
76 |     sampling_locations.requires_grad = grad_sampling_loc
77 |     attention_weights.requires_grad = grad_attn_weight
78 | 
79 |     gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step))
80 | 
81 |     print(f'* {gradok} check_gradient_numerical(D={channels})')
82 | 
83 | 
84 | if __name__ == '__main__':
85 |     check_forward_equal_with_pytorch_double()
86 |     check_forward_equal_with_pytorch_float()
87 | 
88 |     for channels in [30, 32, 64, 71, 1025, 2048, 3096]:
89 |         check_gradient_numerical(channels, True, True, True)
90 | 
91 | 
92 | 
93 | 


--------------------------------------------------------------------------------
/dinov/body/encoder/registry.py:
--------------------------------------------------------------------------------
 1 | _model_entrypoints = {}
 2 | 
 3 | def register_encoder(fn):
 4 |     module_name_split = fn.__module__.split('.')
 5 |     model_name = module_name_split[-1]
 6 |     _model_entrypoints[model_name] = fn
 7 |     return fn
 8 | 
 9 | def model_entrypoints(model_name):
10 |     return _model_entrypoints[model_name]
11 | 
12 | def is_model(model_name):
13 |     return model_name in _model_entrypoints
14 | 


--------------------------------------------------------------------------------
/dinov/body/general_head.py:
--------------------------------------------------------------------------------
  1 | # ------------------------------------------------------------------------
  2 | # Copyright (c) MicroSoft, Inc. and its affiliates.
  3 | # Modified from DINO https://github.com/IDEA-Research/MaskDINO by Feng Li.
  4 | # ------------------------------------------------------------------------
  5 | import logging
  6 | from typing import Callable, Dict, List, Optional, Tuple, Union
  7 | 
  8 | from torch import nn
  9 | 
 10 | from detectron2.layers import Conv2d, ShapeSpec, get_norm
 11 | 
 12 | from .registry import register_body
 13 | from .encoder import build_encoder
 14 | from .decoder import build_decoder
 15 | from ..utils import configurable
 16 | 
 17 | 
 18 | class IMaskDINOHead(nn.Module):
 19 |     @configurable
 20 |     def __init__(
 21 |         self,
 22 |         input_shape: Dict[str, ShapeSpec],
 23 |         *,
 24 |         num_classes: int,
 25 |         pixel_decoder: nn.Module,
 26 |         loss_weight: float = 1.0,
 27 |         ignore_value: int = -1,
 28 |         transformer_predictor: nn.Module,
 29 |     ):
 30 |         """
 31 |         Args:
 32 |             input_shape: shapes (channels and stride) of the input features
 33 |             num_classes: number of classes to predict
 34 |             pixel_decoder: the pixel decoder module
 35 |             loss_weight: loss weight
 36 |             ignore_value: category id to be ignored during training.
 37 |             transformer_predictor: the transformer decoder that makes prediction
 38 |             transformer_in_feature: input feature name to the transformer_predictor
 39 |         """
 40 |         super().__init__()
 41 |         input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
 42 |         self.in_features = [k for k, v in input_shape]
 43 |         self.ignore_value = ignore_value
 44 |         self.common_stride = 4
 45 |         self.loss_weight = loss_weight
 46 | 
 47 |         self.pixel_decoder = pixel_decoder
 48 |         self.predictor = transformer_predictor
 49 | 
 50 |         self.num_classes = num_classes
 51 |         # store processed features
 52 |         self.processed_features = None
 53 | 
 54 |     @classmethod
 55 |     def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec], lang_encoder: nn.Module, extra: dict):
 56 |         enc_cfg = cfg['MODEL']['ENCODER']
 57 |         dec_cfg = cfg['MODEL']['DECODER']
 58 |         transformer_predictor_in_channels = enc_cfg['CONVS_DIM']
 59 | 
 60 |         return {
 61 |             "input_shape": {
 62 |                 k: v for k, v in input_shape.items() if k in enc_cfg['IN_FEATURES']
 63 |             },
 64 |             "ignore_value": enc_cfg['IGNORE_VALUE'],
 65 |             "num_classes": enc_cfg.get('NUM_CLASSES', None),
 66 |             "pixel_decoder": build_encoder(cfg, input_shape),
 67 |             "loss_weight": enc_cfg['LOSS_WEIGHT'],
 68 |             "transformer_predictor": build_decoder(
 69 |                 cfg,
 70 |                 transformer_predictor_in_channels,
 71 |                 lang_encoder,
 72 |                 mask_classification=True,
 73 |                 extra=extra,
 74 |             ),
 75 |         }
 76 | 
 77 |     def forward_encoder(self, features, mask=None,targets=None, target_queries=None, target_vlp=None, prediction_switch=None, task='seg', extra={}):
 78 |         mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(
 79 |             features, mask)
 80 |         self.processed_features = (mask_features, transformer_encoder_features, multi_scale_features)
 81 | 
 82 |     def forward_decoder(self, features, mask=None,targets=None, target_queries=None, target_vlp=None, prediction_switch=None, task='seg', extra={}):
 83 |         assert self.processed_features is not None, "need to precess features first"
 84 |         mask_features, transformer_encoder_features, multi_scale_features = self.processed_features
 85 |         predictions = self.predictor(multi_scale_features, mask_features, mask, targets=targets,
 86 |                                          target_queries=target_queries, target_vlp=target_vlp, task=task, extra=extra)
 87 |         return predictions
 88 | 
 89 |     def forward(self, features, mask=None, targets=None, target_queries=None, target_vlp=None, task='seg', extra={}):
 90 |         return self.layers(features, mask, targets=targets, target_queries=target_queries, target_vlp=target_vlp, task=task, extra=extra)
 91 | 
 92 |     def layers(self, features, mask=None,targets=None, target_queries=None, target_vlp=None, prediction_switch=None, task='seg', extra={}):
 93 |         mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(features, mask)
 94 |         predictions = self.predictor(multi_scale_features, mask_features, mask, targets=targets,
 95 |                                          target_queries=target_queries, target_vlp=target_vlp, task=task, extra=extra)
 96 |         return predictions
 97 | 
 98 | 
 99 | @register_body
100 | def get_interactive_maskdino_head(cfg, input_shape, lang_encoder, extra):
101 |     return IMaskDINOHead(cfg, input_shape, lang_encoder, extra)


--------------------------------------------------------------------------------
/dinov/body/registry.py:
--------------------------------------------------------------------------------
 1 | _model_entrypoints = {}
 2 | 
 3 | 
 4 | def register_body(fn):
 5 |     module_name_split = fn.__module__.split('.')
 6 |     model_name = module_name_split[-1]
 7 |     _model_entrypoints[model_name] = fn
 8 |     return fn
 9 | 
10 | def model_entrypoints(model_name):
11 |     return _model_entrypoints[model_name]
12 | 
13 | def is_model(model_name):
14 |     return model_name in _model_entrypoints


--------------------------------------------------------------------------------
/dinov/language/__init__.py:
--------------------------------------------------------------------------------
1 | from .build import build_language_encoder


--------------------------------------------------------------------------------
/dinov/language/build.py:
--------------------------------------------------------------------------------
 1 | """
 2 | placeholder for language open-set or grounding
 3 | """
 4 | 
 5 | 
 6 | def build_language_encoder(config, **kwargs):
 7 |     model_name = config['MODEL']['TEXT']['ARCH']
 8 |     if model_name=='noencoder':
 9 |         return None
10 |     else:
11 |         raise NotImplementedError


--------------------------------------------------------------------------------
/dinov/modules/__init__.py:
--------------------------------------------------------------------------------
1 | from .position_encoding import *
2 | from .postprocessing import *
3 | from .matcher import *
4 | from .criterion_visual_refer_one2one import *
5 | from .criterion_visual_openset import *
6 | from .criterion_visual_refer_many2many import *
7 | from .matcher_many2many import *
8 | 
9 | 


--------------------------------------------------------------------------------
/dinov/modules/position_encoding.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Facebook, Inc. and its affiliates.
 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py
 3 | """
 4 | Various positional encodings for the transformer.
 5 | """
 6 | import math
 7 | 
 8 | import torch
 9 | from torch import nn
10 | 
11 | 
12 | class PositionEmbeddingSine(nn.Module):
13 |     """
14 |     This is a more standard version of the position embedding, very similar to the one
15 |     used by the Attention is all you need paper, generalized to work on images.
16 |     """
17 | 
18 |     def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
19 |         super().__init__()
20 |         self.num_pos_feats = num_pos_feats
21 |         self.temperature = temperature
22 |         self.normalize = normalize
23 |         if scale is not None and normalize is False:
24 |             raise ValueError("normalize should be True if scale is passed")
25 |         if scale is None:
26 |             scale = 2 * math.pi
27 |         self.scale = scale
28 | 
29 |     def forward(self, x, mask=None):
30 |         if mask is None:
31 |             mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool)
32 |         not_mask = ~mask
33 |         y_embed = not_mask.cumsum(1, dtype=x.dtype)
34 |         x_embed = not_mask.cumsum(2, dtype=x.dtype)
35 |         if self.normalize:
36 |             eps = 1e-6
37 |             y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
38 |             x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
39 | 
40 |         dim_t = torch.arange(self.num_pos_feats, dtype=x.dtype, device=x.device)
41 |         dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
42 | 
43 |         pos_x = x_embed[:, :, :, None] / dim_t
44 |         pos_y = y_embed[:, :, :, None] / dim_t
45 |         pos_x = torch.stack(
46 |             (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
47 |         ).flatten(3)
48 |         pos_y = torch.stack(
49 |             (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
50 |         ).flatten(3)
51 |         pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
52 |         return pos
53 | 
54 |     def __repr__(self, _repr_indent=4):
55 |         head = "Positional encoding " + self.__class__.__name__
56 |         body = [
57 |             "num_pos_feats: {}".format(self.num_pos_feats),
58 |             "temperature: {}".format(self.temperature),
59 |             "normalize: {}".format(self.normalize),
60 |             "scale: {}".format(self.scale),
61 |         ]
62 |         # _repr_indent = 4
63 |         lines = [head] + [" " * _repr_indent + line for line in body]
64 |         return "\n".join(lines)
65 | 


--------------------------------------------------------------------------------
/dinov/modules/postprocessing.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Facebook, Inc. and its affiliates.
 2 | import torch
 3 | from torch.nn import functional as F
 4 | 
 5 | from detectron2.structures import Instances, ROIMasks
 6 | 
 7 | 
 8 | def sem_seg_postprocess(result, img_size, output_height, output_width):
 9 |     """
10 |     Return semantic segmentation predictions in the original resolution.
11 | 
12 |     The input images are often resized when entering semantic segmentor. Moreover, in same
13 |     cases, they also padded inside segmentor to be divisible by maximum network stride.
14 |     As a result, we often need the predictions of the segmentor in a different
15 |     resolution from its inputs.
16 | 
17 |     Args:
18 |         result (Tensor): semantic segmentation prediction logits. A tensor of shape (C, H, W),
19 |             where C is the number of classes, and H, W are the height and width of the prediction.
20 |         img_size (tuple): image size that segmentor is taking as input.
21 |         output_height, output_width: the desired output resolution.
22 | 
23 |     Returns:
24 |         semantic segmentation prediction (Tensor): A tensor of the shape
25 |             (C, output_height, output_width) that contains per-pixel soft predictions.
26 |     """
27 |     if len(result.shape)>3:
28 |         result = result[:, :, : img_size[0], : img_size[1]].expand(1, -1, -1, -1)
29 |     else:
30 |         result = result[:, : img_size[0], : img_size[1]].expand(1, -1, -1, -1)
31 |     result = F.interpolate(
32 |         result, size=(output_height, output_width), mode="bicubic", align_corners=False, antialias=True
33 |     )[0]
34 |     return result
35 | 


--------------------------------------------------------------------------------
/dinov/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .config import *
2 | from .misc import *
3 | from .box_ops import *


--------------------------------------------------------------------------------
/dinov/utils/box_ops.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
  2 | """
  3 | Utilities for bounding box manipulation and GIoU.
  4 | """
  5 | import torch
  6 | from torchvision.ops.boxes import box_area
  7 | import numpy as np
  8 | 
  9 | 
 10 | def build_point_grid(n_per_side: int) -> np.ndarray:
 11 |     """Generates a 2D grid of points evenly spaced in [0,1]x[0,1]."""
 12 |     offset = 1 / (2 * n_per_side)
 13 |     points_one_side = np.linspace(offset, 1 - offset, n_per_side)
 14 |     points_x = np.tile(points_one_side[None, :], (n_per_side, 1))
 15 |     points_y = np.tile(points_one_side[:, None], (1, n_per_side))
 16 |     points = np.stack([points_x, points_y], axis=-1).reshape(-1, 2)
 17 |     return points
 18 | 
 19 | def box_cxcywh_to_xyxy(x):
 20 |     x_c, y_c, w, h = x.unbind(-1)
 21 |     b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
 22 |          (x_c + 0.5 * w), (y_c + 0.5 * h)]
 23 |     return torch.stack(b, dim=-1)
 24 | 
 25 | 
 26 | def box_xyxy_to_cxcywh(x):
 27 |     x0, y0, x1, y1 = x.unbind(-1)
 28 |     b = [(x0 + x1) / 2, (y0 + y1) / 2,
 29 |          (x1 - x0), (y1 - y0)]
 30 |     return torch.stack(b, dim=-1)
 31 | 
 32 | def box_xywh_to_xyxy(x):
 33 |     x0, y0, x1, y1 = x.unbind(-1)
 34 |     b = [x0, y0, (x0 + x1), (y0 + y1)]
 35 |     return torch.stack(b, dim=-1)
 36 | 
 37 | 
 38 | # modified from torchvision to also return the union
 39 | def box_iou(boxes1, boxes2):
 40 |     area1 = box_area(boxes1)
 41 |     area2 = box_area(boxes2)
 42 | 
 43 |     lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
 44 |     rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]
 45 | 
 46 |     wh = (rb - lt).clamp(min=0)  # [N,M,2]
 47 |     inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]
 48 | 
 49 |     union = area1[:, None] + area2 - inter
 50 | 
 51 |     iou = inter / (union+1e-6)
 52 |     return iou, union
 53 | 
 54 | 
 55 | def generalized_box_iou(boxes1, boxes2):
 56 |     """
 57 |     Generalized IoU from https://giou.stanford.edu/
 58 | 
 59 |     The boxes should be in [x0, y0, x1, y1] format
 60 | 
 61 |     Returns a [N, M] pairwise matrix, where N = len(boxes1)
 62 |     and M = len(boxes2)
 63 |     """
 64 |     # degenerate boxes gives inf / nan results
 65 |     # so do an early check
 66 |     assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
 67 |     assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
 68 |     iou, union = box_iou(boxes1, boxes2)
 69 | 
 70 |     lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
 71 |     rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
 72 | 
 73 |     wh = (rb - lt).clamp(min=0)  # [N,M,2]
 74 |     area = wh[:, :, 0] * wh[:, :, 1]
 75 | 
 76 |     return iou - (area - union) / (area+1e-6)
 77 | 
 78 | def generalized_box_iou_padded(boxes1, boxes2):
 79 |     """
 80 |     Generalized IoU from https://giou.stanford.edu/
 81 | 
 82 |     The boxes should be in [x0, y0, x1, y1] format
 83 | 
 84 |     Returns a [N, M] pairwise matrix, where N = len(boxes1)
 85 |     and M = len(boxes2)
 86 |     """
 87 |     # degenerate boxes gives inf / nan results
 88 |     # so do an early check
 89 |     # assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
 90 |     # assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
 91 |     iou, union = box_iou(boxes1, boxes2)
 92 | 
 93 |     lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
 94 |     rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
 95 | 
 96 |     wh = (rb - lt).clamp(min=0)  # [N,M,2]
 97 |     area = wh[:, :, 0] * wh[:, :, 1]
 98 | 
 99 |     return iou - (area - union) / (area+1e-6)
100 | 
101 | 
102 | def masks_to_boxes(masks):
103 |     """Compute the bounding boxes around the provided masks
104 | 
105 |     The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions.
106 | 
107 |     Returns a [N, 4] tensors, with the boxes in xyxy format
108 |     """
109 |     if masks.numel() == 0:
110 |         return torch.zeros((0, 4), device=masks.device)
111 | 
112 |     h, w = masks.shape[-2:]
113 | 
114 |     y = torch.arange(0, h, dtype=torch.float)
115 |     x = torch.arange(0, w, dtype=torch.float)
116 |     y, x = torch.meshgrid(y, x)
117 | 
118 |     x_mask = (masks * x.unsqueeze(0))
119 |     x_max = x_mask.flatten(1).max(-1)[0]
120 |     x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
121 | 
122 |     y_mask = (masks * y.unsqueeze(0))
123 |     y_max = y_mask.flatten(1).max(-1)[0]
124 |     y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
125 | 
126 |     return torch.stack([x_min, y_min, x_max, y_max], 1)


--------------------------------------------------------------------------------
/dinov/utils/config.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # Copyright (c) Facebook, Inc. and its affiliates.
  3 | 
  4 | import functools
  5 | import inspect
  6 | 
  7 | def configurable(init_func=None, *, from_config=None):
  8 |     """
  9 |     Decorate a function or a class's __init__ method so that it can be called
 10 |     with a :class:`CfgNode` object using a :func:`from_config` function that translates
 11 |     :class:`CfgNode` to arguments.
 12 | 
 13 |     Examples:
 14 |     ::
 15 |         # Usage 1: Decorator on __init__:
 16 |         class A:
 17 |             @configurable
 18 |             def __init__(self, a, b=2, c=3):
 19 |                 pass
 20 | 
 21 |             @classmethod
 22 |             def from_config(cls, cfg):   # 'cfg' must be the first argument
 23 |                 # Returns kwargs to be passed to __init__
 24 |                 return {"a": cfg.A, "b": cfg.B}
 25 | 
 26 |         a1 = A(a=1, b=2)  # regular construction
 27 |         a2 = A(cfg)       # construct with a cfg
 28 |         a3 = A(cfg, b=3, c=4)  # construct with extra overwrite
 29 | 
 30 |         # Usage 2: Decorator on any function. Needs an extra from_config argument:
 31 |         @configurable(from_config=lambda cfg: {"a: cfg.A, "b": cfg.B})
 32 |         def a_func(a, b=2, c=3):
 33 |             pass
 34 | 
 35 |         a1 = a_func(a=1, b=2)  # regular call
 36 |         a2 = a_func(cfg)       # call with a cfg
 37 |         a3 = a_func(cfg, b=3, c=4)  # call with extra overwrite
 38 | 
 39 |     Args:
 40 |         init_func (callable): a class's ``__init__`` method in usage 1. The
 41 |             class must have a ``from_config`` classmethod which takes `cfg` as
 42 |             the first argument.
 43 |         from_config (callable): the from_config function in usage 2. It must take `cfg`
 44 |             as its first argument.
 45 |     """
 46 | 
 47 |     if init_func is not None:
 48 |         assert (
 49 |             inspect.isfunction(init_func)
 50 |             and from_config is None
 51 |             and init_func.__name__ == "__init__"
 52 |         ), "Incorrect use of @configurable. Check API documentation for examples."
 53 | 
 54 |         @functools.wraps(init_func)
 55 |         def wrapped(self, *args, **kwargs):
 56 |             try:
 57 |                 from_config_func = type(self).from_config
 58 |             except AttributeError as e:
 59 |                 raise AttributeError(
 60 |                     "Class with @configurable must have a 'from_config' classmethod."
 61 |                 ) from e
 62 |             if not inspect.ismethod(from_config_func):
 63 |                 raise TypeError("Class with @configurable must have a 'from_config' classmethod.")
 64 | 
 65 |             # import ipdb; ipdb.set_trace()
 66 |             if _called_with_cfg(*args, **kwargs):
 67 |                 explicit_args = _get_args_from_config(from_config_func, *args, **kwargs)
 68 |                 init_func(self, **explicit_args)
 69 |             else:
 70 |                 init_func(self, *args, **kwargs)
 71 | 
 72 |         return wrapped
 73 | 
 74 |     else:
 75 |         if from_config is None:
 76 |             return configurable  # @configurable() is made equivalent to @configurable
 77 |         assert inspect.isfunction(
 78 |             from_config
 79 |         ), "from_config argument of configurable must be a function!"
 80 | 
 81 |         def wrapper(orig_func):
 82 |             @functools.wraps(orig_func)
 83 |             def wrapped(*args, **kwargs):
 84 |                 if _called_with_cfg(*args, **kwargs):
 85 |                     explicit_args = _get_args_from_config(from_config, *args, **kwargs)
 86 |                     return orig_func(**explicit_args)
 87 |                 else:
 88 |                     return orig_func(*args, **kwargs)
 89 | 
 90 |             wrapped.from_config = from_config
 91 |             return wrapped
 92 | 
 93 |         return wrapper
 94 | 
 95 | def _called_with_cfg(*args, **kwargs):
 96 |     """
 97 |     Returns:
 98 |         bool: whether the arguments contain CfgNode and should be considered
 99 |             forwarded to from_config.
100 |     """
101 |     from omegaconf import DictConfig, OmegaConf, ListConfig
102 |     # from detectron2.config import LazyConfig
103 | 
104 |     if len(args) and (isinstance(args[0], (dict)) or (isinstance(args[0], (DictConfig)))):
105 |         return True
106 |     if isinstance(kwargs.pop("cfg", None), (dict)):
107 |         return True
108 |     # `from_config`'s first argument is forced to be "cfg".
109 |     # So the above check covers all cases.
110 |     return False
111 | 
112 | def _get_args_from_config(from_config_func, *args, **kwargs):
113 |     """
114 |     Use `from_config` to obtain explicit arguments.
115 | 
116 |     Returns:
117 |         dict: arguments to be used for cls.__init__
118 |     """
119 |     signature = inspect.signature(from_config_func)
120 |     if list(signature.parameters.keys())[0] != "cfg":
121 |         if inspect.isfunction(from_config_func):
122 |             name = from_config_func.__name__
123 |         else:
124 |             name = f"{from_config_func.__self__}.from_config"
125 |         raise TypeError(f"{name} must take 'cfg' as the first argument!")
126 |     support_var_arg = any(
127 |         param.kind in [param.VAR_POSITIONAL, param.VAR_KEYWORD]
128 |         for param in signature.parameters.values()
129 |     )
130 |     if support_var_arg:  # forward all arguments to from_config, if from_config accepts them
131 |         ret = from_config_func(*args, **kwargs)
132 |     else:
133 |         # forward supported arguments to from_config
134 |         supported_arg_names = set(signature.parameters.keys())
135 |         extra_kwargs = {}
136 |         for name in list(kwargs.keys()):
137 |             if name not in supported_arg_names:
138 |                 extra_kwargs[name] = kwargs.pop(name)
139 |         ret = from_config_func(*args, **kwargs)
140 |         # forward the other arguments to __init__
141 |         ret.update(extra_kwargs)
142 |     return ret


--------------------------------------------------------------------------------
/repo.diff:
--------------------------------------------------------------------------------
 1 | diff --git openseed/architectures/joint_oi_model.py openseed/architectures/joint_oi_model.py
 2 | index 8086690..a0679fe 100644
 3 | --- openseed/architectures/joint_oi_model.py
 4 | +++ openseed/architectures/joint_oi_model.py
 5 | @@ -286,6 +286,7 @@ class GeneralizedMaskDINO(nn.Module):
 6 |              "coco_on": dec_cfg.get('COCO', True),
 7 |              "coco_mask_on": dec_cfg.get('COCO_MASK', True),
 8 |              "o365_on": dec_cfg.get('O365', True),
 9 | +            "regenerate_point": dec_cfg.get('RE_POINT', False),
10 |          }
11 |  
12 |      @property
13 | @@ -531,7 +532,7 @@ class GeneralizedMaskDINO(nn.Module):
14 |  
15 |              # if not self.training:
16 |              #     box_start = int(num_mask/4*3)
17 | -            box_start = random.randint(0, self.max_num_instance - 1)  # box based interactive after this number; about 1/4
18 | +            box_start = random.randint(1, self.max_num_instance - 1)  # box based interactive after this number; about 1/4
19 |              point_coords = targets_per_image.point_coords[index[:box_start]]
20 |              # FIXME randomly sample one point as the user input
21 |              if self.regenerate_point:


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | torch
 2 | torchvision
 3 | opencv-python
 4 | pyyaml
 5 | json_tricks
 6 | yacs
 7 | scikit-learn
 8 | pandas
 9 | timm==0.4.12
10 | numpy==1.23.5
11 | einops
12 | fvcore
13 | transformers==4.19.2
14 | sentencepiece
15 | ftfy
16 | regex
17 | nltk
18 | vision-datasets==0.2.2
19 | pycocotools==2.0.4
20 | diffdist
21 | pyarrow
22 | cityscapesscripts
23 | shapely
24 | scikit-image
25 | mup
26 | gradio==3.40.0
27 | scann
28 | kornia==0.6.4
29 | torchmetrics==0.6.0
30 | progressbar
31 | pillow==9.4.0
32 | 


--------------------------------------------------------------------------------
/utils/Config.py:
--------------------------------------------------------------------------------
 1 | from fvcore.common.config import CfgNode as _CfgNode
 2 | 
 3 | class CfgNode(_CfgNode):
 4 |     """
 5 |     The same as `fvcore.common.config.CfgNode`, but different in:
 6 | 
 7 |     1. Use unsafe yaml loading by default.
 8 |        Note that this may lead to arbitrary code execution: you must not
 9 |        load a config file from untrusted sources before manually inspecting
10 |        the content of the file.
11 |     2. Support config versioning.
12 |        When attempting to merge an old config, it will convert the old config automatically.
13 | 
14 |     .. automethod:: clone
15 |     .. automethod:: freeze
16 |     .. automethod:: defrost
17 |     .. automethod:: is_frozen
18 |     .. automethod:: load_yaml_with_base
19 |     .. automethod:: merge_from_list
20 |     .. automethod:: merge_from_other_cfg
21 |     """
22 | 
23 |     def merge_from_dict(self, dict):
24 |         pass
25 |     
26 | node = CfgNode()


--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .dist import *


--------------------------------------------------------------------------------
/utils/arguments.py:
--------------------------------------------------------------------------------
  1 | import yaml
  2 | import json
  3 | import argparse
  4 | import logging
  5 | 
  6 | logger = logging.getLogger(__name__)
  7 | 
  8 | 
  9 | def load_config_dict_to_opt(opt, config_dict):
 10 |     """
 11 |     Load the key, value pairs from config_dict to opt, overriding existing values in opt
 12 |     if there is any.
 13 |     """
 14 |     if not isinstance(config_dict, dict):
 15 |         raise TypeError("Config must be a Python dictionary")
 16 |     for k, v in config_dict.items():
 17 |         k_parts = k.split('.')
 18 |         pointer = opt
 19 |         for k_part in k_parts[:-1]:
 20 |             if k_part not in pointer:
 21 |                 pointer[k_part] = {}
 22 |             pointer = pointer[k_part]
 23 |             assert isinstance(pointer, dict), "Overriding key needs to be inside a Python dict."
 24 |         ori_value = pointer.get(k_parts[-1])
 25 |         pointer[k_parts[-1]] = v
 26 |         if ori_value:
 27 |             logger.warning(f"Overrided {k} from {ori_value} to {pointer[k_parts[-1]]}")
 28 | 
 29 | def load_opt_from_config_file(conf_file):
 30 |     """
 31 |     Load opt from the config files, settings in later files can override those in previous files.
 32 | 
 33 |     Args:
 34 |         conf_files: config file path
 35 | 
 36 |     Returns:
 37 |         dict: a dictionary of opt settings
 38 |     """
 39 |     opt = {}
 40 |     with open(conf_file, encoding='utf-8') as f:
 41 |         config_dict = yaml.safe_load(f)
 42 | 
 43 |     load_config_dict_to_opt(opt, config_dict)
 44 | 
 45 |     return opt
 46 | 
 47 | 
 48 | def load_opt_from_config_files(conf_files):
 49 |     """
 50 |     Load opt from the config files, settings in later files can override those in previous files.
 51 | 
 52 |     Args:
 53 |         conf_files (list): a list of config file paths
 54 | 
 55 |     Returns:
 56 |         dict: a dictionary of opt settings
 57 |     """
 58 |     opt = {}
 59 |     for conf_file in conf_files:
 60 |         with open(conf_file, encoding='utf-8') as f:
 61 |             config_dict = yaml.safe_load(f)
 62 | 
 63 |         load_config_dict_to_opt(opt, config_dict)
 64 | 
 65 |     return opt
 66 | 
 67 | 
 68 | def load_opt_command(args):
 69 |     parser = argparse.ArgumentParser(description='Pretrain or fine-tune models for NLP tasks.')
 70 |     parser.add_argument('command', help='Command: train/evaluate/train-and-evaluate')
 71 |     parser.add_argument('--conf_files', nargs='+', required=True, help='Path(s) to the config file(s).')
 72 |     parser.add_argument('--user_dir', help='Path to the user defined module for tasks (models, criteria), optimizers, and lr schedulers.')
 73 |     parser.add_argument('--config_overrides', nargs='*', help='Override parameters on config with a json style string, e.g. {"<PARAM_NAME_1>": <PARAM_VALUE_1>, "<PARAM_GROUP_2>.<PARAM_SUBGROUP_2>.<PARAM_2>": <PARAM_VALUE_2>}. A key with "." updates the object in the corresponding nested dict. Remember to escape " in command line.')
 74 |     parser.add_argument('--overrides', help='arguments that used to override the config file in cmdline', nargs=argparse.REMAINDER)
 75 | 
 76 |     cmdline_args = parser.parse_args() if not args else parser.parse_args(args)
 77 | 
 78 |     opt = load_opt_from_config_files(cmdline_args.conf_files)
 79 | 
 80 |     if cmdline_args.config_overrides:
 81 |         config_overrides_string = ' '.join(cmdline_args.config_overrides)
 82 |         logger.warning(f"Command line config overrides: {config_overrides_string}")
 83 |         config_dict = json.loads(config_overrides_string)
 84 |         load_config_dict_to_opt(opt, config_dict)
 85 | 
 86 |     if cmdline_args.overrides:
 87 |         assert len(cmdline_args.overrides) % 2 == 0, "overrides arguments is not paired, required: key value"
 88 |         keys = [cmdline_args.overrides[idx*2] for idx in range(len(cmdline_args.overrides)//2)]
 89 |         vals = [cmdline_args.overrides[idx*2+1] for idx in range(len(cmdline_args.overrides)//2)]
 90 |         vals = [val.replace('false', '').replace('False','') if len(val.replace(' ', '')) == 5 else val for val in vals]
 91 | 
 92 |         types = []
 93 |         for key in keys:
 94 |             key = key.split('.')
 95 |             ele = opt.copy()
 96 |             while len(key) > 0:
 97 |                 ele = ele[key.pop(0)]
 98 |             types.append(type(ele))
 99 |         
100 |         config_dict = {x:z(y) for x,y,z in zip(keys, vals, types)}
101 |         load_config_dict_to_opt(opt, config_dict)
102 | 
103 |     # combine cmdline_args into opt dictionary
104 |     for key, val in cmdline_args.__dict__.items():
105 |         if val is not None:
106 |             opt[key] = val
107 | 
108 |     return opt, cmdline_args
109 | 
110 | 
111 | def save_opt_to_json(opt, conf_file):
112 |     with open(conf_file, 'w', encoding='utf-8') as f:
113 |         json.dump(opt, f, indent=4)
114 | 
115 | 
116 | def save_opt_to_yaml(opt, conf_file):
117 |     with open(conf_file, 'w', encoding='utf-8') as f:
118 |         yaml.dump(opt, f)
119 | 


--------------------------------------------------------------------------------
/utils/dist.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json, time
 3 | import torch
 4 | import torch.distributed as dist
 5 | 
 6 | 
 7 | def get_world_size():
 8 |     if torch.distributed.is_initialized():
 9 |         return torch.distributed.get_world_size()
10 |     return 1
11 | 
12 | def all_gather(x):
13 |     if get_world_size() > 1:
14 |         all_x = [torch.zeros_like(x) for _ in range(get_world_size())]
15 |         torch.distributed.all_gather(all_x, x.detach())
16 |         all_x[torch.distributed.get_rank()] = x
17 |         x = torch.stack(all_x, dim=0)
18 |     return x
19 | 
20 | def init_distributed_mode(args):
21 |     if 'WORLD_SIZE' in os.environ and os.environ['WORLD_SIZE'] != '': # 'RANK' in os.environ and 
22 |         args.rank = int(os.environ["RANK"])
23 |         args.world_size = int(os.environ['WORLD_SIZE'])
24 |         args.gpu = args.local_rank = int(os.environ['LOCAL_RANK'])
25 | 
26 |         print('world size: {}, rank: {}, local rank: {}'.format(args.world_size, args.rank, args.local_rank))
27 |         print(json.dumps(dict(os.environ), indent=2))
28 |     elif 'SLURM_PROCID' in os.environ:
29 |         args.rank = int(os.environ['SLURM_PROCID'])
30 |         args.gpu = args.local_rank = int(os.environ['SLURM_LOCALID'])
31 |         args.world_size = int(os.environ['SLURM_NPROCS'])
32 | 
33 |         if os.environ.get('HAND_DEFINE_DIST_URL', 0) == '1':
34 |             pass
35 |         else:
36 |             import util.hostlist as uh
37 |             nodenames = uh.parse_nodelist(os.environ['SLURM_JOB_NODELIST'])
38 |             gpu_ids = [int(node[3:]) for node in nodenames]
39 |             fixid = int(os.environ.get('FIX_DISTRIBUTED_PORT_NUMBER', 0))
40 |             # fixid += random.randint(0, 300)
41 |             port = str(3137 + int(min(gpu_ids)) + fixid)
42 |             args.dist_url = "tcp://{ip}:{port}".format(ip=uh.nodename_to_ip(nodenames[0]), port=port)
43 | 
44 |         print('world size: {}, world rank: {}, local rank: {}, device_count: {}'.format(args.world_size, args.rank, args.local_rank, torch.cuda.device_count()))
45 | 
46 | 
47 |     else:
48 |         print('Not using distributed mode')
49 |         args.distributed = False
50 |         args.world_size = 1
51 |         args.rank = 0
52 |         args.local_rank = 0
53 |         return
54 | 
55 |     print("world_size:{} rank:{} local_rank:{}".format(args.world_size, args.rank, args.local_rank))
56 |     args.distributed = True
57 |     torch.cuda.set_device(args.local_rank)
58 |     args.dist_backend = 'nccl'
59 |     print('| distributed init (rank {}): {}'.format(args.rank, args.dist_url), flush=True)
60 | 
61 |     torch.distributed.init_process_group(
62 |         backend=args.dist_backend, 
63 |         world_size=args.world_size, 
64 |         rank=args.rank,
65 |         init_method=args.dist_url,
66 |     )
67 | 
68 |     print("Before torch.distributed.barrier()")
69 |     torch.distributed.barrier()
70 |     print("End torch.distributed.barrier()")


--------------------------------------------------------------------------------
/utils/misc.py:
--------------------------------------------------------------------------------
 1 | # --------------------------------------------------------
 2 | # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
 3 | # Copyright (c) 2022 Microsoft
 4 | # Licensed under The MIT License [see LICENSE for details]
 5 | # Written by Xueyan Zou (xueyan@cs.wisc.edu)
 6 | # --------------------------------------------------------
 7 | import math
 8 | import wandb
 9 | import os
10 | 
11 | 
12 | # HACK for evalution 
13 | def hook_metadata(metadata, name):
14 |     if name == 'cityscapes_fine_sem_seg_val':
15 |         metadata.__setattr__("keep_sem_bgd", False)
16 |     return metadata
17 | 
18 | def hook_opt(model, name):
19 |     if name in ['cityscapes_fine_panoptic_val', 'ade20k_panoptic_val', 'bdd10k_40_panoptic_val', 'cityscapes_fine_panoptic_val', 'scannet_21_panoptic_val']:
20 |         model.model.object_mask_threshold = 0.4
21 |     else:
22 |         model.model.object_mask_threshold = 0.8
23 | 
24 | # HACK for evalution 
25 | def hook_switcher(model, name):
26 |     mappings = {}
27 |     if name in ['cityscapes_fine_sem_seg_val', 'scannet_21_val_seg', 'scannet_38_val_seg', 'scannet_41_val_seg', 'sunrgbd_37_val_seg', 'bdd10k_val_sem_seg', 'ade20k_full_sem_seg_val']:
28 |         mappings = {'SEMANTIC_ON': True, 'INSTANCE_ON': False, 'PANOPTIC_ON': False}
29 |     elif name in ['cityscapes_fine_instance_seg_val', 'pascal_part_val_interactive', 'pascal_part_val', 'pascal_part_train'] or 'seginw' in name or 'lvis' in name or 'odinw' in name:
30 |         mappings = {'SEMANTIC_ON': False, 'INSTANCE_ON': True, 'PANOPTIC_ON': False}
31 |     elif name in ['cityscapes_fine_panoptic_val', 'scannet_21_panoptic_val', 'bdd10k_40_panoptic_val']:
32 |         # mappings = {'SEMANTIC_ON': True, 'INSTANCE_ON': False, 'PANOPTIC_ON': True}
33 |         mappings = {'SEMANTIC_ON': True, 'INSTANCE_ON': True, 'PANOPTIC_ON': True}
34 |     elif 'coco_2017_val_panoptic_with_sem_seg' in name or name in ['ade20k_panoptic_val', 'ade20k_panoptic_train', 'coco_2017_test-dev', 'sam_val', 'sam_minival']:
35 |         mappings = {'SEMANTIC_ON': True, 'INSTANCE_ON': True, 'PANOPTIC_ON': True}
36 |     elif name in ['cityscapes_fine_sem_seg_val', 'scannet_21_val_seg', 'scannet_38_val_seg', 'scannet_41_val_seg',
37 |                 'sunrgbd_37_val_seg', 'context_59_val_seg', 'context_459_val_seg', 'voc_2012_val_seg',
38 |                 'bdd10k_val_sem_seg']:
39 |         mappings = {'SEMANTIC_ON': True, 'INSTANCE_ON': False, 'PANOPTIC_ON': False}
40 |     elif name in ['cityscapes_fine_instance_seg_val'] or 'seginw' in name:
41 |         mappings = {'SEMANTIC_ON': False, 'INSTANCE_ON': True, 'PANOPTIC_ON': False}
42 |         mappings = {'SEMANTIC_ON': True, 'INSTANCE_ON': True, 'PANOPTIC_ON': True}
43 |     elif name in ['coco_2017_val_panoptic_with_sem_seg', 'ade20k_panoptic_val']:
44 |         mappings = {'SEMANTIC_ON': True, 'INSTANCE_ON': True, 'PANOPTIC_ON': True}
45 |     else:
46 |         if name not in ["vlp_val", "vlp_captioning_val", "vlp_val2017", "vlp_captioning_val2017", "imagenet_val", "refcocog_val_google", "phrasecut_val", "phrasecut_test", "refcocop_val_unc", "refcoco_val_unc", "refcocog_val_umd"]:
47 |             assert False, "dataset switcher is not defined"
48 |     for key, value in mappings.items():
49 |         if key == 'SEMANTIC_ON':
50 |             model.model.semantic_on = value
51 |         if key == 'INSTANCE_ON':
52 |             model.model.instance_on = value
53 |         if key == 'PANOPTIC_ON':
54 |             model.model.panoptic_on = value
55 | 
56 | class AverageMeter(object):
57 |     """Computes and stores the average and current value."""
58 |     def __init__(self):
59 |         self.reset()
60 | 
61 |     def reset(self):
62 |         self.val = 0
63 |         self.avg = 0
64 |         self.sum = 0
65 |         self.count = 0
66 | 
67 |     def update(self, val, n=1, decay=0):
68 |         self.val = val
69 |         if decay:
70 |             alpha = math.exp(-n / decay)  # exponential decay over 100 updates
71 |             self.sum = alpha * self.sum + (1 - alpha) * val * n
72 |             self.count = alpha * self.count + (1 - alpha) * n
73 |         else:
74 |             self.sum += val * n
75 |             self.count += n
76 |         self.avg = self.sum / self.count
77 | 
78 | def init_wandb(args, job_dir, entity='646396839lifeng', project='xdecoder', job_name='tmp'):
79 |     wandb_dir = os.path.join(job_dir, 'wandb')
80 |     os.makedirs(wandb_dir, exist_ok=True)
81 |     runid = None
82 |     if os.path.exists(f"{wandb_dir}/runid.txt"):
83 |         runid = open(f"{wandb_dir}/runid.txt").read()
84 | 
85 |     wandb.init(project=project,
86 |                name=job_name,
87 |                dir=wandb_dir,
88 |                entity=entity,
89 |                resume="allow",
90 |                id=runid,
91 |                config={"hierarchical": True}, )
92 | 
93 |     open(f"{wandb_dir}/runid.txt", 'w').write(wandb.run.id)
94 |     wandb.config.update({k: args[k] for k in args if k not in wandb.config})


--------------------------------------------------------------------------------
/utils/model.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | import time
 4 | import pickle
 5 | import torch
 6 | from detectron2.utils.comm import is_main_process
 7 | 
 8 | logger = logging.getLogger(__name__)
 9 | 
10 | 
11 | NORM_MODULES = [
12 |     torch.nn.BatchNorm1d,
13 |     torch.nn.BatchNorm2d,
14 |     torch.nn.BatchNorm3d,
15 |     torch.nn.SyncBatchNorm,
16 |     # NaiveSyncBatchNorm inherits from BatchNorm2d
17 |     torch.nn.GroupNorm,
18 |     torch.nn.InstanceNorm1d,
19 |     torch.nn.InstanceNorm2d,
20 |     torch.nn.InstanceNorm3d,
21 |     torch.nn.LayerNorm,
22 |     torch.nn.LocalResponseNorm,
23 | ]
24 | 
25 | def register_norm_module(cls):
26 |     NORM_MODULES.append(cls)
27 |     return cls
28 | 
29 | def align_and_update_state_dicts(model_state_dict, ckpt_state_dict):
30 |     model_keys = sorted(model_state_dict.keys())
31 |     ckpt_keys = sorted(ckpt_state_dict.keys())
32 |     result_dicts = {}
33 |     matched_log = []
34 |     unmatched_log = []
35 |     unloaded_log = []
36 |     for model_key in model_keys:
37 |         model_weight = model_state_dict[model_key]
38 |         if model_key in ckpt_keys:
39 |             ckpt_weight = ckpt_state_dict[model_key]
40 |             if model_weight.shape == ckpt_weight.shape:
41 |                 result_dicts[model_key] = ckpt_weight
42 |                 ckpt_keys.pop(ckpt_keys.index(model_key))
43 |                 matched_log.append("Loaded {}, Model Shape: {} <-> Ckpt Shape: {}".format(model_key, model_weight.shape, ckpt_weight.shape))
44 |             else:
45 |                 unmatched_log.append("*UNMATCHED* {}, Model Shape: {} <-> Ckpt Shape: {}".format(model_key, model_weight.shape, ckpt_weight.shape))
46 |         else:
47 |             unloaded_log.append("*UNLOADED* {}, Model Shape: {}".format(model_key, model_weight.shape))
48 |             
49 |     if is_main_process():
50 |         for info in matched_log:
51 |             logger.info(info)
52 |         for info in unloaded_log:
53 |             logger.warning(info)
54 |         for key in ckpt_keys:
55 |             logger.warning("$UNUSED$ {}, Ckpt Shape: {}".format(key, ckpt_state_dict[key].shape))
56 |         for info in unmatched_log:
57 |             logger.warning(info)
58 |     return result_dicts


--------------------------------------------------------------------------------
/utils/sam_utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | 
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 


--------------------------------------------------------------------------------
/utils/sam_utils/onnx.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
  2 | # All rights reserved.
  3 | 
  4 | # This source code is licensed under the license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | import torch
  8 | import torch.nn as nn
  9 | from torch.nn import functional as F
 10 | 
 11 | from typing import Tuple
 12 | 
 13 | from ..modeling import Sam
 14 | from .amg import calculate_stability_score
 15 | 
 16 | 
 17 | class SamOnnxModel(nn.Module):
 18 |     """
 19 |     This model should not be called directly, but is used in ONNX export.
 20 |     It combines the prompt encoder, mask decoder, and mask postprocessing of Sam,
 21 |     with some functions modified to enable model tracing. Also supports extra
 22 |     options controlling what information. See the ONNX export script for details.
 23 |     """
 24 | 
 25 |     def __init__(
 26 |         self,
 27 |         model: Sam,
 28 |         return_single_mask: bool,
 29 |         use_stability_score: bool = False,
 30 |         return_extra_metrics: bool = False,
 31 |     ) -> None:
 32 |         super().__init__()
 33 |         self.mask_decoder = model.mask_decoder
 34 |         self.model = model
 35 |         self.img_size = model.image_encoder.img_size
 36 |         self.return_single_mask = return_single_mask
 37 |         self.use_stability_score = use_stability_score
 38 |         self.stability_score_offset = 1.0
 39 |         self.return_extra_metrics = return_extra_metrics
 40 | 
 41 |     @staticmethod
 42 |     def resize_longest_image_size(
 43 |         input_image_size: torch.Tensor, longest_side: int
 44 |     ) -> torch.Tensor:
 45 |         input_image_size = input_image_size.to(torch.float32)
 46 |         scale = longest_side / torch.max(input_image_size)
 47 |         transformed_size = scale * input_image_size
 48 |         transformed_size = torch.floor(transformed_size + 0.5).to(torch.int64)
 49 |         return transformed_size
 50 | 
 51 |     def _embed_points(self, point_coords: torch.Tensor, point_labels: torch.Tensor) -> torch.Tensor:
 52 |         point_coords = point_coords + 0.5
 53 |         point_coords = point_coords / self.img_size
 54 |         point_embedding = self.model.prompt_encoder.pe_layer._pe_encoding(point_coords)
 55 |         point_labels = point_labels.unsqueeze(-1).expand_as(point_embedding)
 56 | 
 57 |         point_embedding = point_embedding * (point_labels != -1)
 58 |         point_embedding = point_embedding + self.model.prompt_encoder.not_a_point_embed.weight * (
 59 |             point_labels == -1
 60 |         )
 61 | 
 62 |         for i in range(self.model.prompt_encoder.num_point_embeddings):
 63 |             point_embedding = point_embedding + self.model.prompt_encoder.point_embeddings[
 64 |                 i
 65 |             ].weight * (point_labels == i)
 66 | 
 67 |         return point_embedding
 68 | 
 69 |     def _embed_masks(self, input_mask: torch.Tensor, has_mask_input: torch.Tensor) -> torch.Tensor:
 70 |         mask_embedding = has_mask_input * self.model.prompt_encoder.mask_downscaling(input_mask)
 71 |         mask_embedding = mask_embedding + (
 72 |             1 - has_mask_input
 73 |         ) * self.model.prompt_encoder.no_mask_embed.weight.reshape(1, -1, 1, 1)
 74 |         return mask_embedding
 75 | 
 76 |     def mask_postprocessing(self, masks: torch.Tensor, orig_im_size: torch.Tensor) -> torch.Tensor:
 77 |         masks = F.interpolate(
 78 |             masks,
 79 |             size=(self.img_size, self.img_size),
 80 |             mode="bilinear",
 81 |             align_corners=False,
 82 |         )
 83 | 
 84 |         prepadded_size = self.resize_longest_image_size(orig_im_size, self.img_size)
 85 |         masks = masks[..., : int(prepadded_size[0]), : int(prepadded_size[1])]
 86 | 
 87 |         orig_im_size = orig_im_size.to(torch.int64)
 88 |         h, w = orig_im_size[0], orig_im_size[1]
 89 |         masks = F.interpolate(masks, size=(h, w), mode="bilinear", align_corners=False)
 90 |         return masks
 91 | 
 92 |     def select_masks(
 93 |         self, masks: torch.Tensor, iou_preds: torch.Tensor, num_points: int
 94 |     ) -> Tuple[torch.Tensor, torch.Tensor]:
 95 |         # Determine if we should return the multiclick mask or not from the number of points.
 96 |         # The reweighting is used to avoid control flow.
 97 |         score_reweight = torch.tensor(
 98 |             [[1000] + [0] * (self.model.mask_decoder.num_mask_tokens - 1)]
 99 |         ).to(iou_preds.device)
100 |         score = iou_preds + (num_points - 2.5) * score_reweight
101 |         best_idx = torch.argmax(score, dim=1)
102 |         masks = masks[torch.arange(masks.shape[0]), best_idx, :, :].unsqueeze(1)
103 |         iou_preds = iou_preds[torch.arange(masks.shape[0]), best_idx].unsqueeze(1)
104 | 
105 |         return masks, iou_preds
106 | 
107 |     @torch.no_grad()
108 |     def forward(
109 |         self,
110 |         image_embeddings: torch.Tensor,
111 |         point_coords: torch.Tensor,
112 |         point_labels: torch.Tensor,
113 |         mask_input: torch.Tensor,
114 |         has_mask_input: torch.Tensor,
115 |         orig_im_size: torch.Tensor,
116 |     ):
117 |         sparse_embedding = self._embed_points(point_coords, point_labels)
118 |         dense_embedding = self._embed_masks(mask_input, has_mask_input)
119 | 
120 |         masks, scores = self.model.mask_decoder.predict_masks(
121 |             image_embeddings=image_embeddings,
122 |             image_pe=self.model.prompt_encoder.get_dense_pe(),
123 |             sparse_prompt_embeddings=sparse_embedding,
124 |             dense_prompt_embeddings=dense_embedding,
125 |         )
126 | 
127 |         if self.use_stability_score:
128 |             scores = calculate_stability_score(
129 |                 masks, self.model.mask_threshold, self.stability_score_offset
130 |             )
131 | 
132 |         if self.return_single_mask:
133 |             masks, scores = self.select_masks(masks, scores, point_coords.shape[1])
134 | 
135 |         upscaled_masks = self.mask_postprocessing(masks, orig_im_size)
136 | 
137 |         if self.return_extra_metrics:
138 |             stability_scores = calculate_stability_score(
139 |                 upscaled_masks, self.model.mask_threshold, self.stability_score_offset
140 |             )
141 |             areas = (upscaled_masks > self.model.mask_threshold).sum(-1).sum(-1)
142 |             return upscaled_masks, scores, stability_scores, areas, masks
143 | 
144 |         return upscaled_masks, scores, masks
145 | 


--------------------------------------------------------------------------------
/utils/sam_utils/transforms.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
  2 | # All rights reserved.
  3 | 
  4 | # This source code is licensed under the license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | import numpy as np
  8 | import torch
  9 | from torch.nn import functional as F
 10 | from torchvision.transforms.functional import resize, to_pil_image  # type: ignore
 11 | 
 12 | from copy import deepcopy
 13 | from typing import Tuple
 14 | 
 15 | 
 16 | class ResizeLongestSide:
 17 |     """
 18 |     Resizes images to longest side 'target_length', as well as provides
 19 |     methods for resizing coordinates and boxes. Provides methods for
 20 |     transforming both numpy array and batched torch tensors.
 21 |     """
 22 | 
 23 |     def __init__(self, target_length: int) -> None:
 24 |         self.target_length = target_length
 25 | 
 26 |     def apply_image(self, image: np.ndarray) -> np.ndarray:
 27 |         """
 28 |         Expects a numpy array with shape HxWxC in uint8 format.
 29 |         """
 30 |         target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length)
 31 |         return np.array(resize(to_pil_image(image), target_size))
 32 | 
 33 |     def apply_coords(self, coords: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray:
 34 |         """
 35 |         Expects a numpy array of length 2 in the final dimension. Requires the
 36 |         original image size in (H, W) format.
 37 |         """
 38 |         old_h, old_w = original_size
 39 |         new_h, new_w = self.get_preprocess_shape(
 40 |             original_size[0], original_size[1], self.target_length
 41 |         )
 42 |         coords = deepcopy(coords).astype(float)
 43 |         coords[..., 0] = coords[..., 0] * (new_w / old_w)
 44 |         coords[..., 1] = coords[..., 1] * (new_h / old_h)
 45 |         return coords
 46 | 
 47 |     def apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray:
 48 |         """
 49 |         Expects a numpy array shape Bx4. Requires the original image size
 50 |         in (H, W) format.
 51 |         """
 52 |         boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size)
 53 |         return boxes.reshape(-1, 4)
 54 | 
 55 |     def apply_image_torch(self, image: torch.Tensor) -> torch.Tensor:
 56 |         """
 57 |         Expects batched images with shape BxCxHxW and float format. This
 58 |         transformation may not exactly match apply_image. apply_image is
 59 |         the transformation expected by the model.
 60 |         """
 61 |         # Expects an image in BCHW format. May not exactly match apply_image.
 62 |         target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length)
 63 |         return F.interpolate(
 64 |             image, target_size, mode="bilinear", align_corners=False, antialias=True
 65 |         )
 66 | 
 67 |     def apply_coords_torch(
 68 |         self, coords: torch.Tensor, original_size: Tuple[int, ...]
 69 |     ) -> torch.Tensor:
 70 |         """
 71 |         Expects a torch tensor with length 2 in the last dimension. Requires the
 72 |         original image size in (H, W) format.
 73 |         """
 74 |         old_h, old_w = original_size
 75 |         new_h, new_w = self.get_preprocess_shape(
 76 |             original_size[0], original_size[1], self.target_length
 77 |         )
 78 |         coords = deepcopy(coords).to(torch.float)
 79 |         coords[..., 0] = coords[..., 0] * (new_w / old_w)
 80 |         coords[..., 1] = coords[..., 1] * (new_h / old_h)
 81 |         return coords
 82 | 
 83 |     def apply_boxes_torch(
 84 |         self, boxes: torch.Tensor, original_size: Tuple[int, ...]
 85 |     ) -> torch.Tensor:
 86 |         """
 87 |         Expects a torch tensor with shape Bx4. Requires the original image
 88 |         size in (H, W) format.
 89 |         """
 90 |         boxes = self.apply_coords_torch(boxes.reshape(-1, 2, 2), original_size)
 91 |         return boxes.reshape(-1, 4)
 92 | 
 93 |     @staticmethod
 94 |     def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int) -> Tuple[int, int]:
 95 |         """
 96 |         Compute the output size given input size and target long side length.
 97 |         """
 98 |         scale = long_side_length * 1.0 / max(oldh, oldw)
 99 |         newh, neww = oldh * scale, oldw * scale
100 |         neww = int(neww + 0.5)
101 |         newh = int(newh + 0.5)
102 |         return (newh, neww)
103 | 


--------------------------------------------------------------------------------