├── .gitattributes
├── .gitignore
├── DATASET.md
├── README.md
├── __init__.py
├── configs
├── dinov_sam_ade_eval.yaml
├── dinov_sam_coco_swinl_train.yaml
└── dinov_sam_coco_train.yaml
├── datasets
├── __init__.py
├── build.py
├── custom_dataset_dataloader.py
├── dataset_mappers
│ ├── __init__.py
│ ├── bdd_semseg_dataset_mapper.py
│ ├── coco_instance_new_baseline_dataset_mapper.py
│ ├── coco_interactive_panoptic_new_baseline_dataset_mapper.py
│ ├── coco_panoptic_new_baseline_dataset_mapper.py
│ ├── davis_dataset_mapper.py
│ ├── inference_mapper_with_gt.py
│ ├── instance_inference_mapper_with_gt.py
│ ├── lvis_dataset_mapper.py
│ ├── lvis_dataset_mapper_with_gt.py
│ ├── mask_former_instance_dataset_mapper.py
│ ├── mask_former_interactive_panoptic_dataset_mapper.py
│ ├── mask_former_panoptic_dataset_mapper.py
│ ├── mask_former_semantic_dataset_mapper.py
│ ├── o365_instance_new_baseline_dataset_mapper.py
│ ├── object365_dataset_mapper.py
│ ├── part_data_filter_whole_new_instance_dataset_mapper.py
│ ├── pascal_instance_new_baseline_dataset_mapper.py
│ ├── pascalcontext_dataset_mapper.py
│ ├── sam_baseline_dataset_mapper.py
│ ├── sam_baseline_dataset_mapper_content.py
│ ├── sam_baseline_dataset_mapper_json.py
│ ├── scannet_dataset_mapper.py
│ ├── scannet_pano_dataset_mapper.py
│ ├── seginw_dataset_mapper.py
│ ├── sunrgbd_dataset_mapper.py
│ └── ytvos_dataset_mapper.py
├── evaluation
│ ├── __init__.py
│ ├── instance_evaluation.py
│ ├── interactive_evaluation.py
│ ├── panoptic_evaluation.py
│ ├── pascal_part_evaluation.py
│ └── segmentation_evaluation.py
├── registration
│ ├── __init__.py
│ ├── register_ade20k_full.py
│ ├── register_ade20k_instance.py
│ ├── register_ade20k_panoptic.py
│ ├── register_bdd100k_panoseg.py
│ ├── register_bdd100k_semseg.py
│ ├── register_coco_panoptic_annos_semseg.py
│ ├── register_coco_panoptic_annos_semseg_interactive.py
│ ├── register_coco_panoptic_annos_semseg_interactive_jointboxpoint.py
│ ├── register_coco_stuff_10k.py
│ ├── register_context_semseg.py
│ ├── register_davis_dataset.py
│ ├── register_lvis_eval.py
│ ├── register_object365_od.py
│ ├── register_odinw_od.py
│ ├── register_paco_part_all.py
│ ├── register_partimagenet_part_all.py
│ ├── register_pascal_part_all.py
│ ├── register_pascal_part_all_interactive.py
│ ├── register_sam.py
│ ├── register_sam_json.py
│ ├── register_scannet_panoptic.py
│ ├── register_scannet_semseg.py
│ ├── register_seginw_instance.py
│ ├── register_sunrgbd_semseg.py
│ └── register_ytvos_dataset.py
├── semseg_loader.py
├── shapes
│ ├── __init__.py
│ ├── mask_generators.py
│ ├── sampler.py
│ ├── scribble.py
│ └── simpleclick_sampler.py
└── utils
│ └── tsv
│ │ ├── __init__.py
│ │ ├── io_common.py
│ │ └── tsv_io.py
├── demo
├── __init__.py
├── examples
│ ├── bags.jpg
│ ├── corgi2.jpg
│ ├── img.png
│ └── ref_cat.jpeg
└── openset_task.py
├── demo_openset.py
├── dinov
├── BaseModel.py
├── __init__.py
├── architectures
│ ├── __init__.py
│ ├── build.py
│ ├── dinov.py
│ └── registry.py
├── backbone
│ ├── __init__.py
│ ├── backbone.py
│ ├── build.py
│ ├── focal.py
│ ├── focal_dw.py
│ ├── registry.py
│ └── swin.py
├── body
│ ├── __init__.py
│ ├── build.py
│ ├── decoder
│ │ ├── __init__.py
│ │ ├── build.py
│ │ ├── dinov_openset_decoder.py
│ │ ├── dinov_refer_decoder.py
│ │ ├── registry.py
│ │ └── utils
│ │ │ ├── __init__.py
│ │ │ ├── dino_decoder.py
│ │ │ └── utils.py
│ ├── encoder
│ │ ├── __init__.py
│ │ ├── build.py
│ │ ├── encoder_deform.py
│ │ ├── ops
│ │ │ ├── functions
│ │ │ │ ├── __init__.py
│ │ │ │ └── ms_deform_attn_func.py
│ │ │ ├── modules
│ │ │ │ ├── __init__.py
│ │ │ │ └── ms_deform_attn.py
│ │ │ ├── setup.py
│ │ │ ├── src
│ │ │ │ ├── cpu
│ │ │ │ │ ├── ms_deform_attn_cpu.cpp
│ │ │ │ │ └── ms_deform_attn_cpu.h
│ │ │ │ ├── cuda
│ │ │ │ │ ├── ms_deform_attn_cuda.cu
│ │ │ │ │ ├── ms_deform_attn_cuda.h
│ │ │ │ │ └── ms_deform_im2col_cuda.cuh
│ │ │ │ ├── ms_deform_attn.h
│ │ │ │ └── vision.cpp
│ │ │ └── test.py
│ │ ├── registry.py
│ │ └── transformer_encoder_fpn.py
│ ├── general_head.py
│ ├── registry.py
│ └── transformer_blocks.py
├── language
│ ├── __init__.py
│ └── build.py
├── modules
│ ├── __init__.py
│ ├── criterion_visual_openset.py
│ ├── criterion_visual_refer_many2many.py
│ ├── criterion_visual_refer_one2one.py
│ ├── matcher.py
│ ├── matcher_many2many.py
│ ├── position_encoding.py
│ └── postprocessing.py
└── utils
│ ├── __init__.py
│ ├── box_ops.py
│ ├── config.py
│ └── misc.py
├── repo.diff
├── requirements.txt
├── train_net.py
└── utils
├── Config.py
├── __init__.py
├── arguments.py
├── constants.py
├── dist.py
├── lvis_cat.py
├── misc.py
├── model.py
├── sam_utils
│ ├── __init__.py
│ ├── amg.py
│ ├── onnx.py
│ └── transforms.py
└── visualizer.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # IntelliJ project files
2 | #repo.diff
3 | .idea
4 | .vscode
5 | .amltignore
6 | *.iml
7 | out
8 | gen
9 | visinf
10 | coco_caption
11 | ### Vim template
12 | [._]*.s[a-w][a-z]
13 | [._]s[a-w][a-z]
14 | *.un~
15 | Session.vim
16 | .netrwhist
17 | *~
18 | *.sh
19 | vis_scribble
20 |
21 | ### IPythonNotebook template
22 | # Temporary data
23 | .ipynb_checkpoints/
24 |
25 | ### Python template
26 | # Byte-compiled / optimized / DLL files
27 | __pycache__/
28 | *.py[cod]
29 | *$py.class
30 |
31 | # C extensions
32 | *.so
33 |
34 | # Distribution / packaging
35 | .Python
36 | env/
37 | build/
38 | develop-eggs/
39 | dist/
40 | downloads/
41 | eggs/
42 | .eggs/
43 | #lib/
44 | #lib64/
45 | parts/
46 | sdist/
47 | var/
48 | *.egg-info/
49 | .installed.cfg
50 | *.egg
51 |
52 | # PyInstaller
53 | # Usually these files are written by a python script from a template
54 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
55 | *.manifest
56 | *.spec
57 |
58 | # Installer logs
59 | pip-log.txt
60 | pip-delete-this-directory.txt
61 |
62 | # Unit test / coverage reports
63 | htmlcov/
64 | .tox/
65 | .coverage
66 | .coverage.*
67 | .cache
68 | nosetests.xml
69 | coverage.xml
70 | *,cover
71 |
72 | # Translations
73 | *.mo
74 | *.pot
75 |
76 | # Django stuff:
77 | *.log
78 |
79 | # Sphinx documentation
80 | docs/_build/
81 |
82 | # PyBuilder
83 | target/
84 |
85 | *.ipynb
86 | *.params
87 | # *.json
88 | #.vscode/
89 | *.code-workspace/
90 |
91 | lib/pycocotools/_mask.c
92 | lib/nms/cpu_nms.c
93 |
94 | OUTPUT
95 | OUTPUT/*
96 | models/*
97 | DATASET
98 | DATASET/*
99 | external/
100 | MODELS
101 | MODELS/*
102 |
103 | kill.sh
104 |
105 | draws/
106 | plot/
107 |
108 |
109 |
110 |
111 | *venv/*
112 | *.pt
113 | *.pth
114 |
--------------------------------------------------------------------------------
/DATASET.md:
--------------------------------------------------------------------------------
1 | # Preparing Dataset
2 | Our dataloader follows [Detectron2](https://github.com/facebookresearch/detectron2) and contains (1) a dataset registrator, (2) a dataset mapper, and (3) a dataset loader. We modify the dataset registrator and mapper for different datasets; a minimal sketch of how the three pieces fit together is given at the end of this file.
3 |
4 | ## Training Dataset Note
5 |
6 | There is overlap between the COCO2017, COCO-Karpathy, and RefCOCO datasets, and RefCOCO overlaps entirely with the COCO2017 training data; we have excluded the refcocog-umd validation and coco-karpathy test splits during training.
7 |
8 | ## ADE20K, Cityscapes, COCO
9 | Please refer to [Mask2Former](https://github.com/facebookresearch/Mask2Former/tree/main/datasets).
10 |
11 | ## BDD100K
12 | Please download the 10k split of BDD100k at https://doc.bdd100k.com/download.html#id1
13 |
14 | ### Expected dataset structure for BDD100K:
15 | ```
16 | .
17 | └── bdd100k/
18 | ├── images/
19 | │ └── 10k/
20 | │ ├── test
21 | │ ├── train
22 | │ └── val
23 | └── labels/
24 | ├── ins_seg
25 | ├── pan_seg
26 | └── sem_seg
27 | ```
28 |
29 | ## RefCOCO
30 | Please download the original refcoco datasets at https://github.com/lichengunc/refer.
31 |
32 | ### Expected dataset structure for refcoco:
33 | ```
34 | .
35 | └── refcocoseg/
36 | └── refcocog/
37 | ├── instances.json
38 | ├── refs(google).p
39 | └── refs(umd).p
40 | ```
41 |
42 | Also download the coco dataset at https://cocodataset.org/#home:
43 | ### Expected dataset structure for coco:
44 | ```
45 | .
46 | └── coco/
47 | ├── annotations
48 | ├── train2017
49 | └── val2017
50 | ```
51 |
52 | After preparing the dataset, run the following command:
53 |
54 | ```sh
55 | # NOTE: Please modify coco_root and ref_root
56 | python3 refcoco2json.py
57 | ```
58 |
59 | ## SUN-RGBD
60 |
61 |
62 | ## SCAN-Net
63 |
64 |
65 |
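66 | ## Example: registering and mapping a custom dataset
67 | 
68 | Below is a minimal, repo-agnostic sketch of the registrator / mapper / loader split described at the top of this file, using plain Detectron2 APIs. The names `my_dataset_train`, `get_my_dicts`, and `my_mapper` are hypothetical placeholders, not names used in this codebase; the real mappers live in `datasets/dataset_mappers/` and the registrators in `datasets/registration/`.
69 | 
70 | ```python
71 | from detectron2.data import DatasetCatalog, MetadataCatalog, build_detection_train_loader
72 | 
73 | # (1) Registrator: register a function that returns a list of dicts in Detectron2 Dataset format.
74 | def get_my_dicts():
75 |     return [{"file_name": "example.jpg", "image_id": 0, "height": 480, "width": 640, "annotations": []}]
76 | 
77 | DatasetCatalog.register("my_dataset_train", get_my_dicts)
78 | MetadataCatalog.get("my_dataset_train").set(thing_classes=["object"])
79 | 
80 | # (2) Mapper: turn one dataset dict into the per-sample format the model consumes.
81 | def my_mapper(dataset_dict):
82 |     return dataset_dict  # a real mapper would read the image and build tensors here
83 | 
84 | # (3) Loader: batch the mapped samples for training.
85 | loader = build_detection_train_loader(dataset=get_my_dicts(), mapper=my_mapper, total_batch_size=2)
86 | ```
87 | 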
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UX-Decoder/DINOv/53bf20d5cfdbb86fa35141a1cff432d4923599f2/__init__.py
--------------------------------------------------------------------------------
/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from . import registration
2 | from .build import *
--------------------------------------------------------------------------------
/datasets/dataset_mappers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from .coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper
3 | from .coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper
4 | from .coco_interactive_panoptic_new_baseline_dataset_mapper import COCOInteractivePanopticNewBaselineDatasetMapper
5 | from .mask_former_instance_dataset_mapper import MaskFormerInstanceDatasetMapper
6 | from .mask_former_panoptic_dataset_mapper import MaskFormerPanopticDatasetMapper
7 | from .mask_former_interactive_panoptic_dataset_mapper import MaskFormerPanopticDatasetMapperInteractive
8 | from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper
9 | from .sunrgbd_dataset_mapper import SunRGBDSegDatasetMapper
10 | from .scannet_dataset_mapper import ScanNetSegDatasetMapper
11 | from .bdd_semseg_dataset_mapper import BDDSemDatasetMapper
12 | from .scannet_pano_dataset_mapper import ScanNetPanoDatasetMapper
13 | from .o365_instance_new_baseline_dataset_mapper import O365InstanceNewBaselineDatasetMapper
14 | from .sam_baseline_dataset_mapper import build_transform_gen as sam_transform_gen
15 | from .sam_baseline_dataset_mapper import SamBaselineDatasetMapper
16 | from .sam_baseline_dataset_mapper_json import SamBaselineDatasetMapperJSON
17 | from .sam_baseline_dataset_mapper_content import SamBaselineDatasetMapperContent
18 | from .pascal_instance_new_baseline_dataset_mapper import PascalInstanceNewBaselineDatasetMapper
19 | from .part_data_filter_whole_new_instance_dataset_mapper import PartFilterWholeInstanceNewBaselineDatasetMapper
20 | from .inference_mapper_with_gt import CoCoInferenceDatasetMapper
21 | from .instance_inference_mapper_with_gt import InstanceInferenceDatasetMapperGT
22 |
23 | from .davis_dataset_mapper import DAVISDatasetMapper
24 | from .ytvos_dataset_mapper import YTVOSDatasetMapper
25 | from .seginw_dataset_mapper import SeginWDatasetMapper
26 | from .lvis_dataset_mapper_with_gt import LVISInferenceMapperWithGT
27 | from .pascalcontext_dataset_mapper import PascalContextSegDatasetMapper
--------------------------------------------------------------------------------
/datasets/dataset_mappers/bdd_semseg_dataset_mapper.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
3 | # Copyright (c) 2022 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Modified by Xueyan Zou (xueyan@cs.wisc.edu)
6 | # --------------------------------------------------------
7 | # Copyright (c) Facebook, Inc. and its affiliates.
8 | import copy
9 |
10 | import scipy.io
11 | import numpy as np
12 | import torch
13 | from PIL import Image
14 |
15 | from torchvision import transforms
16 | from dinov.utils import configurable
17 |
18 | __all__ = ["BDDSemDatasetMapper"]
19 |
20 |
21 | # This is specifically designed for the BDD100K dataset.
22 | class BDDSemDatasetMapper:
23 | """
24 | A callable which takes a dataset dict in Detectron2 Dataset format,
25 | and map it into a format used by MaskFormer.
26 |
27 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
28 |
29 | The callable currently does the following:
30 |
31 | 1. Read the image from "file_name"
32 | 2. Applies geometric transforms to the image and annotation
33 | 3. Find and applies suitable cropping to the image and annotation
34 | 4. Prepare image and annotation to Tensors
35 | """
36 |
37 | @configurable
38 | def __init__(
39 | self,
40 | is_train=True,
41 | min_size_test=None,
42 | max_size_test=None,
43 | mean=None,
44 | std=None,
45 | ):
46 | """
47 | NOTE: this interface is experimental.
48 | Args:
49 | is_train: for training or inference
50 | augmentations: a list of augmentations or deterministic transforms to apply
51 | tfm_gens: data augmentation
52 | image_format: an image format supported by :func:`detection_utils.read_image`.
53 | """
54 | self.is_train = is_train
55 | self.min_size_test = min_size_test
56 | self.max_size_test = max_size_test
57 | self.pixel_mean = torch.tensor(mean)[:,None,None]
58 | self.pixel_std = torch.tensor(std)[:,None,None]
59 |
60 | t = []
61 | t.append(transforms.Resize(self.min_size_test, interpolation=Image.BICUBIC))
62 | self.transform = transforms.Compose(t)
63 |
64 | @classmethod
65 | def from_config(cls, cfg, is_train=True):
66 | ret = {
67 | "is_train": is_train,
68 | "min_size_test": cfg['INPUT']['MIN_SIZE_TEST'],
69 | "max_size_test": cfg['INPUT']['MAX_SIZE_TEST'],
70 | "mean": cfg['INPUT']['PIXEL_MEAN'],
71 | "std": cfg['INPUT']['PIXEL_STD'],
72 | }
73 | return ret
74 |
75 | def read_semseg(self, file_name):
76 | if '.png' in file_name:
77 | semseg = np.asarray(Image.open(file_name))
78 | elif '.mat' in file_name:
79 | semseg = scipy.io.loadmat(file_name)['LabelMap']
80 | return semseg
81 |
82 | def __call__(self, dataset_dict):
83 | """
84 | Args:
85 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
86 |
87 | Returns:
88 | dict: a format that builtin models in detectron2 accept
89 | """
90 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
91 | file_name = dataset_dict['file_name']
92 | semseg_name = dataset_dict['sem_seg_file_name']
93 | image = Image.open(file_name).convert('RGB')
94 |
95 | dataset_dict['width'] = image.size[0]
96 | dataset_dict['height'] = image.size[1]
97 |
98 | if self.is_train == False:
99 | image = self.transform(image)
100 | image = torch.from_numpy(np.asarray(image).copy())
101 | image = image.permute(2,0,1)
102 |
103 | semseg = self.read_semseg(semseg_name)
104 | semseg = torch.from_numpy(semseg.astype(np.int32))
105 | dataset_dict['image'] = image
106 | dataset_dict['semseg'] = semseg
107 | return dataset_dict
--------------------------------------------------------------------------------
/datasets/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) 2022 IDEA. All Rights Reserved.
3 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
4 | # ------------------------------------------------------------------------
5 | # Modified from Mask2Former https://github.com/facebookresearch/Mask2Former by Feng Li.
6 | import copy
7 | import logging
8 |
9 | import numpy as np
10 | import torch
11 |
12 | from detectron2.config import configurable
13 | from detectron2.data import detection_utils as utils
14 | from detectron2.data import transforms as T
15 | from detectron2.data.transforms import TransformGen
16 | from detectron2.structures import BitMasks, Boxes, Instances
17 |
18 | __all__ = ["COCOPanopticNewBaselineDatasetMapper"]
19 |
20 |
21 | def build_transform_gen(cfg, is_train):
22 | """
23 | Create a list of default :class:`Augmentation` from config.
24 | Now it includes resizing and flipping.
25 | Returns:
26 | list[Augmentation]
27 | """
28 | assert is_train, "Only support training augmentation"
29 | image_size = cfg.INPUT.IMAGE_SIZE
30 | min_scale = cfg.INPUT.MIN_SCALE
31 | max_scale = cfg.INPUT.MAX_SCALE
32 |
33 | augmentation = []
34 |
35 | if cfg.INPUT.RANDOM_FLIP != "none":
36 | augmentation.append(
37 | T.RandomFlip(
38 | horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal",
39 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical",
40 | )
41 | )
42 |
43 | augmentation.extend([
44 | T.ResizeScale(
45 | min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size
46 | ),
47 | T.FixedSizeCrop(crop_size=(image_size, image_size)),
48 | ])
49 |
50 | return augmentation
51 |
52 |
53 | # This is specifically designed for the COCO dataset.
54 | class COCOPanopticNewBaselineDatasetMapper:
55 | """
56 | A callable which takes a dataset dict in Detectron2 Dataset format,
57 | and map it into a format used by MaskFormer.
58 |
59 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
60 |
61 | The callable currently does the following:
62 |
63 | 1. Read the image from "file_name"
64 | 2. Applies geometric transforms to the image and annotation
65 | 3. Find and applies suitable cropping to the image and annotation
66 | 4. Prepare image and annotation to Tensors
67 | """
68 |
69 | @configurable
70 | def __init__(
71 | self,
72 | is_train=True,
73 | *,
74 | tfm_gens,
75 | image_format,
76 | ):
77 | """
78 | NOTE: this interface is experimental.
79 | Args:
80 | is_train: for training or inference
81 | augmentations: a list of augmentations or deterministic transforms to apply
82 | crop_gen: crop augmentation
83 | tfm_gens: data augmentation
84 | image_format: an image format supported by :func:`detection_utils.read_image`.
85 | """
86 | self.tfm_gens = tfm_gens
87 | logging.getLogger(__name__).info(
88 | "[COCOPanopticNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(
89 | str(self.tfm_gens)
90 | )
91 | )
92 |
93 | self.img_format = image_format
94 | self.is_train = is_train
95 |
96 | @classmethod
97 | def from_config(cls, cfg, is_train=True):
98 | # Build augmentation
99 | tfm_gens = build_transform_gen(cfg, is_train)
100 |
101 | ret = {
102 | "is_train": is_train,
103 | "tfm_gens": tfm_gens,
104 | "image_format": cfg.INPUT.FORMAT,
105 | }
106 | return ret
107 |
108 | def __call__(self, dataset_dict):
109 | """
110 | Args:
111 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
112 |
113 | Returns:
114 | dict: a format that builtin models in detectron2 accept
115 | """
116 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
117 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
118 | utils.check_image_size(dataset_dict, image)
119 |
120 | image, transforms = T.apply_transform_gens(self.tfm_gens, image)
121 | image_shape = image.shape[:2] # h, w
122 |
123 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
124 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
125 | # Therefore it's important to use torch.Tensor.
126 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
127 |
128 | if not self.is_train:
129 | # USER: Modify this if you want to keep them for some reason.
130 | dataset_dict.pop("annotations", None)
131 | return dataset_dict
132 |
133 | if "pan_seg_file_name" in dataset_dict:
134 | pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB")
135 | segments_info = dataset_dict["segments_info"]
136 |
137 | # apply the same transformation to panoptic segmentation
138 | pan_seg_gt = transforms.apply_segmentation(pan_seg_gt)
139 |
140 | from panopticapi.utils import rgb2id
141 |
142 | pan_seg_gt = rgb2id(pan_seg_gt)
143 |
144 | instances = Instances(image_shape)
145 | classes = []
146 | masks = []
147 | for segment_info in segments_info:
148 | class_id = segment_info["category_id"]
149 | if not segment_info["iscrowd"]:
150 | classes.append(class_id)
151 | masks.append(pan_seg_gt == segment_info["id"])
152 |
153 | classes = np.array(classes)
154 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
155 | if len(masks) == 0:
156 | # Some image does not have annotation (all ignored)
157 | instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
158 | instances.gt_boxes = Boxes(torch.zeros((0, 4)))
159 | else:
160 | masks = BitMasks(
161 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
162 | )
163 | instances.gt_masks = masks.tensor
164 | instances.gt_boxes = masks.get_bounding_boxes()
165 |
166 | dataset_dict["instances"] = instances
167 |
168 | return dataset_dict
169 |
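170 | 
171 | # --- Usage sketch (added for illustration, not part of the original file) ---
172 | # A mapper like the one above is typically instantiated from the config and passed to
173 | # detectron2's dataloader builder, e.g. in a Mask2Former-style training script:
174 | #
175 | #   from detectron2.data import build_detection_train_loader
176 | #   mapper = COCOPanopticNewBaselineDatasetMapper(cfg, True)
177 | #   data_loader = build_detection_train_loader(cfg, mapper=mapper)
178 | #
179 | # The exact wiring in this repo may differ (see train_net.py and datasets/build.py).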
--------------------------------------------------------------------------------
/datasets/dataset_mappers/lvis_dataset_mapper.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
3 | import copy
4 | import random
5 |
6 | import scipy.io
7 | import numpy as np
8 | import torch
9 | from PIL import Image
10 |
11 | from torchvision import transforms
12 |
13 | from pycocotools import mask
14 | from detectron2.data import detection_utils as utils
15 | from detectron2.data import transforms as T
16 | from detectron2.data import MetadataCatalog
17 |
18 | from dinov.utils import configurable
19 |
20 | __all__ = ["LVISDatasetMapper"]
21 |
22 | def build_transform_gen(cfg, is_train):
23 | """
24 | Create a list of default :class:`Augmentation` from config.
25 | Now it includes resizing and flipping.
26 | Returns:
27 | list[Augmentation]
28 | """
29 | assert is_train, "Only support training augmentation"
30 | cfg_input = cfg['INPUT']
31 | image_size = cfg_input['IMAGE_SIZE']
32 | min_scale = cfg_input['MIN_SCALE']
33 | max_scale = cfg_input['MAX_SCALE']
34 |
35 | augmentation = []
36 |
37 |
38 | if cfg_input['RANDOM_FLIP'] != "none":
39 | augmentation.append(
40 | T.RandomFlip(
41 | horizontal=cfg_input['RANDOM_FLIP'] == "horizontal",
42 | vertical=cfg_input['RANDOM_FLIP'] == "vertical",
43 | )
44 | )
45 |
46 | augmentation.extend([
47 | T.ResizeScale(
48 | min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size
49 | ),
50 | T.FixedSizeCrop(crop_size=(image_size, image_size)),
51 | ])
52 |
53 | return augmentation
54 |
55 |
56 | # This is specifically designed for the LVIS dataset.
57 | class LVISDatasetMapper:
58 | """
59 | A callable which takes a dataset dict in Detectron2 Dataset format,
60 | and map it into a format used by MaskFormer.
61 |
62 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
63 |
64 | The callable currently does the following:
65 |
66 | 1. Read the image from "file_name"
67 | 2. Applies geometric transforms to the image and annotation
68 | 3. Find and applies suitable cropping to the image and annotation
69 | 4. Prepare image and annotation to Tensors
70 | """
71 |
72 | @configurable
73 | def __init__(
74 | self,
75 | is_train=True,
76 | tfm_gens=None,
77 | image_format=None,
78 | min_size_test=None,
79 | max_size_test=None,
80 | mean=None,
81 | std=None,
82 | max_len=None,
83 | ):
84 | """
85 | NOTE: this interface is experimental.
86 | Args:
87 | is_train: for training or inference
88 | augmentations: a list of augmentations or deterministic transforms to apply
89 | tfm_gens: data augmentation
90 | image_format: an image format supported by :func:`detection_utils.read_image`.
91 | """
92 | self.tfm_gens = tfm_gens
93 | self.img_format = image_format
94 | self.is_train = is_train
95 | self.min_size_test = min_size_test
96 | self.max_size_test = max_size_test
97 | self.pixel_mean = torch.tensor(mean)[:,None,None]
98 | self.pixel_std = torch.tensor(std)[:,None,None]
99 | self.max_grounding_num = max_len
100 |
101 | t = []
102 | t.append(transforms.Resize(self.min_size_test, interpolation=Image.BICUBIC))
103 | self.transform = transforms.Compose(t)
104 | self.categories = torch.load(MetadataCatalog.get('logistic').get('cat_root'))
105 |
106 | @classmethod
107 | def from_config(cls, cfg, is_train=True):
108 | # Build augmentation
109 | if is_train:
110 | tfm_gens = build_transform_gen(cfg, is_train)
111 | else:
112 | tfm_gens = None
113 |
114 | ret = {
115 | "is_train": is_train,
116 | "tfm_gens": tfm_gens,
117 | "image_format": cfg['INPUT']['FORMAT'],
118 | "min_size_test": cfg['INPUT']['MIN_SIZE_TEST'],
119 | "max_size_test": cfg['INPUT']['MAX_SIZE_TEST'],
120 | "mean": cfg['INPUT']['PIXEL_MEAN'],
121 | "std": cfg['INPUT']['PIXEL_STD'],
122 | "max_len": cfg['MODEL']['DECODER']['GROUNDING']['MAX_LEN'],
123 | }
124 | return ret
125 |
126 | def __call__(self, dataset_dict):
127 | """
128 | Args:
129 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
130 |
131 | Returns:
132 | dict: a format that builtin models in detectron2 accept
133 | """
134 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
135 | file_name = dataset_dict['file_name']
136 | if self.is_train == False:
137 | assert False, "Only support training."
138 | else:
139 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
140 | utils.check_image_size(dataset_dict, image)
141 | image, transforms = T.apply_transform_gens(self.tfm_gens, image)
142 | image_shape = image.shape[:2] # h, w
143 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
144 |
145 | assert len(dataset_dict['instance']) > 0
146 | masks_grd = []
147 | texts_grd = []
148 | boxes_grd = []
149 | hash_grd = []
150 | for inst, label in zip(dataset_dict['instance'], dataset_dict['labels']):
151 | rle = mask.frPyObjects(inst, dataset_dict['height'], dataset_dict['width'])
152 | m = mask.decode(rle)
153 | # sometimes there are multiple binary map (corresponding to multiple segs)
154 | m = np.sum(m, axis=2)
155 | m = m.astype(np.uint8) # convert to np.uint8
156 | m = transforms.apply_segmentation(m[:,:,None])[:,:,0]
157 | masks_grd += [m]
158 | label_names = self.categories[label]
159 | rand_id = random.randint(0, len(label_names)-1)
160 | texts_grd.append(label_names[rand_id].lower())
161 | hash_grd.append(hash(label_names[rand_id].lower()))
162 |
163 | indices = torch.randperm(len(hash_grd))[:self.max_grounding_num]
164 | masks_grd = torch.from_numpy(np.stack(masks_grd))[indices]
165 | boxes_grd = torch.tensor(boxes_grd)
166 | texts_grd = np.array(texts_grd)[indices.numpy()].tolist()
167 | hash_grd = np.array(hash_grd)[indices.numpy()].tolist()
168 | groundings = {'masks': masks_grd, 'texts': texts_grd, 'hash': hash_grd, 'mode': 'text'}
169 | dataset_dict["groundings"] = groundings
170 | return dataset_dict
--------------------------------------------------------------------------------
/datasets/dataset_mappers/mask_former_panoptic_dataset_mapper.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import copy
3 | import logging
4 |
5 | import numpy as np
6 | import torch
7 | from torch.nn import functional as F
8 |
9 | from detectron2.data import detection_utils as utils
10 | from detectron2.data import transforms as T
11 | from detectron2.structures import BitMasks, Instances
12 |
13 | from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper
14 | from dinov.utils import configurable
15 |
16 |
17 |
18 | __all__ = ["MaskFormerPanopticDatasetMapper"]
19 |
20 |
21 | class MaskFormerPanopticDatasetMapper(MaskFormerSemanticDatasetMapper):
22 | """
23 | A callable which takes a dataset dict in Detectron2 Dataset format,
24 | and map it into a format used by MaskFormer for panoptic segmentation.
25 |
26 | The callable currently does the following:
27 |
28 | 1. Read the image from "file_name"
29 | 2. Applies geometric transforms to the image and annotation
30 | 3. Find and applies suitable cropping to the image and annotation
31 | 4. Prepare image and annotation to Tensors
32 | """
33 |
34 | @configurable
35 | def __init__(
36 | self,
37 | is_train=True,
38 | *,
39 | augmentations,
40 | image_format,
41 | ignore_label,
42 | size_divisibility,
43 | ):
44 | """
45 | NOTE: this interface is experimental.
46 | Args:
47 | is_train: for training or inference
48 | augmentations: a list of augmentations or deterministic transforms to apply
49 | image_format: an image format supported by :func:`detection_utils.read_image`.
50 | ignore_label: the label that is ignored to evaluation
51 | size_divisibility: pad image size to be divisible by this value
52 | """
53 | super().__init__(
54 | is_train,
55 | augmentations=augmentations,
56 | image_format=image_format,
57 | ignore_label=ignore_label,
58 | size_divisibility=size_divisibility,
59 | )
60 |
61 | def __call__(self, dataset_dict):
62 | """
63 | Args:
64 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
65 |
66 | Returns:
67 | dict: a format that builtin models in detectron2 accept
68 | """
69 | assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!"
70 |
71 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
72 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
73 | utils.check_image_size(dataset_dict, image)
74 |
75 | # semantic segmentation
76 | if "sem_seg_file_name" in dataset_dict:
77 | # PyTorch transformation not implemented for uint16, so converting it to double first
78 | sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double")
79 | else:
80 | sem_seg_gt = None
81 |
82 | # panoptic segmentation
83 | if "pan_seg_file_name" in dataset_dict:
84 | pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB")
85 | segments_info = dataset_dict["segments_info"]
86 | else:
87 | pan_seg_gt = None
88 | segments_info = None
89 |
90 | if pan_seg_gt is None:
91 | raise ValueError(
92 | "Cannot find 'pan_seg_file_name' for panoptic segmentation dataset {}.".format(
93 | dataset_dict["file_name"]
94 | )
95 | )
96 |
97 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
98 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
99 | image = aug_input.image
100 | if sem_seg_gt is not None:
101 | sem_seg_gt = aug_input.sem_seg
102 |
103 | # apply the same transformation to panoptic segmentation
104 | pan_seg_gt = transforms.apply_segmentation(pan_seg_gt)
105 |
106 | from panopticapi.utils import rgb2id
107 |
108 | pan_seg_gt = rgb2id(pan_seg_gt)
109 |
110 | # Pad image and segmentation label here!
111 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
112 | if sem_seg_gt is not None:
113 | sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long"))
114 | pan_seg_gt = torch.as_tensor(pan_seg_gt.astype("long"))
115 |
116 | if self.size_divisibility > 0:
117 | image_size = (image.shape[-2], image.shape[-1])
118 | padding_size = [
119 | 0,
120 | self.size_divisibility - image_size[1],
121 | 0,
122 | self.size_divisibility - image_size[0],
123 | ]
124 | image = F.pad(image, padding_size, value=128).contiguous()
125 | if sem_seg_gt is not None:
126 | sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous()
127 | pan_seg_gt = F.pad(
128 | pan_seg_gt, padding_size, value=0
129 | ).contiguous() # 0 is the VOID panoptic label
130 |
131 | image_shape = (image.shape[-2], image.shape[-1]) # h, w
132 |
133 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
134 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
135 | # Therefore it's important to use torch.Tensor.
136 | dataset_dict["image"] = image
137 | if sem_seg_gt is not None:
138 | dataset_dict["sem_seg"] = sem_seg_gt.long()
139 |
140 | if "annotations" in dataset_dict:
141 |             raise ValueError("Panoptic segmentation dataset should not have 'annotations'.")
142 |
143 | # Prepare per-category binary masks
144 | pan_seg_gt = pan_seg_gt.numpy()
145 | instances = Instances(image_shape)
146 | classes = []
147 | masks = []
148 | for segment_info in segments_info:
149 | class_id = segment_info["category_id"]
150 | if not segment_info["iscrowd"]:
151 | classes.append(class_id)
152 | masks.append(pan_seg_gt == segment_info["id"])
153 |
154 | classes = np.array(classes)
155 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
156 | if len(masks) == 0:
157 | # Some image does not have annotation (all ignored)
158 | instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
159 | else:
160 | masks = BitMasks(
161 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
162 | )
163 | instances.gt_masks = masks.tensor
164 | instances.gt_boxes = masks.get_bounding_boxes()
165 |
166 | dataset_dict["instances"] = instances
167 |
168 | return dataset_dict
169 |
--------------------------------------------------------------------------------
/datasets/dataset_mappers/scannet_dataset_mapper.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
3 | # Copyright (c) 2022 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Modified by Xueyan Zou (xueyan@cs.wisc.edu)
6 | # --------------------------------------------------------
7 | # Copyright (c) Facebook, Inc. and its affiliates.
8 | import copy
9 |
10 | import scipy.io
11 | import numpy as np
12 | import torch
13 | from PIL import Image
14 |
15 | from torchvision import transforms
16 | from dinov.utils import configurable
17 |
18 | __all__ = ["ScanNetSegDatasetMapper"]
19 |
20 |
21 | # This is specifically designed for the ScanNet dataset.
22 | class ScanNetSegDatasetMapper:
23 | """
24 | A callable which takes a dataset dict in Detectron2 Dataset format,
25 | and map it into a format used by MaskFormer.
26 |
27 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
28 |
29 | The callable currently does the following:
30 |
31 | 1. Read the image from "file_name"
32 | 2. Applies geometric transforms to the image and annotation
33 | 3. Find and applies suitable cropping to the image and annotation
34 | 4. Prepare image and annotation to Tensors
35 | """
36 |
37 | @configurable
38 | def __init__(
39 | self,
40 | is_train=True,
41 | min_size_test=None,
42 | max_size_test=None,
43 | mean=None,
44 | std=None,
45 | ):
46 | """
47 | NOTE: this interface is experimental.
48 | Args:
49 | is_train: for training or inference
50 | augmentations: a list of augmentations or deterministic transforms to apply
51 | tfm_gens: data augmentation
52 | image_format: an image format supported by :func:`detection_utils.read_image`.
53 | """
54 | self.is_train = is_train
55 | self.min_size_test = min_size_test
56 | self.max_size_test = max_size_test
57 | self.pixel_mean = torch.tensor(mean)[:,None,None]
58 | self.pixel_std = torch.tensor(std)[:,None,None]
59 |
60 | t = []
61 | t.append(transforms.Resize(self.min_size_test, interpolation=Image.BICUBIC))
62 | self.transform = transforms.Compose(t)
63 |
64 | @classmethod
65 | def from_config(cls, cfg, is_train=True):
66 | ret = {
67 | "is_train": is_train,
68 | "min_size_test": cfg['INPUT']['MIN_SIZE_TEST'],
69 | "max_size_test": cfg['INPUT']['MAX_SIZE_TEST'],
70 | "mean": cfg['INPUT']['PIXEL_MEAN'],
71 | "std": cfg['INPUT']['PIXEL_STD'],
72 | }
73 | return ret
74 |
75 | def read_semseg(self, file_name):
76 | if '.png' in file_name:
77 | semseg = np.asarray(Image.open(file_name))
78 | elif '.mat' in file_name:
79 | semseg = scipy.io.loadmat(file_name)['LabelMap']
80 | return semseg
81 |
82 | def __call__(self, dataset_dict):
83 | """
84 | Args:
85 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
86 |
87 | Returns:
88 | dict: a format that builtin models in detectron2 accept
89 | """
90 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
91 | file_name = dataset_dict['file_name']
92 | semseg_name = dataset_dict['sem_seg_file_name']
93 | image = Image.open(file_name).convert('RGB')
94 |
95 | dataset_dict['width'] = image.size[0]
96 | dataset_dict['height'] = image.size[1]
97 |
98 | if self.is_train == False:
99 | image = self.transform(image)
100 | image = torch.from_numpy(np.asarray(image).copy())
101 | image = image.permute(2,0,1)
102 |
103 | semseg = self.read_semseg(semseg_name)
104 | semseg = torch.from_numpy(semseg.astype(np.int32))
105 | dataset_dict['image'] = image
106 | dataset_dict['semseg'] = semseg
107 | return dataset_dict
--------------------------------------------------------------------------------
/datasets/dataset_mappers/scannet_pano_dataset_mapper.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
3 | # Copyright (c) 2022 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Modified by Xueyan Zou (xueyan@cs.wisc.edu)
6 | # --------------------------------------------------------
7 | # Copyright (c) Facebook, Inc. and its affiliates.
8 | import copy
9 |
10 | import scipy.io
11 | import numpy as np
12 | import torch
13 | from PIL import Image
14 |
15 | from torchvision import transforms
16 | from dinov.utils import configurable
17 |
18 | __all__ = ["ScanNetPanoDatasetMapper"]
19 |
20 |
21 | # This is specifically designed for the ScanNet panoptic dataset.
22 | class ScanNetPanoDatasetMapper:
23 | """
24 | A callable which takes a dataset dict in Detectron2 Dataset format,
25 | and map it into a format used by MaskFormer.
26 |
27 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
28 |
29 | The callable currently does the following:
30 |
31 | 1. Read the image from "file_name"
32 | 2. Applies geometric transforms to the image and annotation
33 | 3. Find and applies suitable cropping to the image and annotation
34 | 4. Prepare image and annotation to Tensors
35 | """
36 |
37 | @configurable
38 | def __init__(
39 | self,
40 | is_train=True,
41 | min_size_test=None,
42 | max_size_test=None,
43 | mean=None,
44 | std=None,
45 | ):
46 | """
47 | NOTE: this interface is experimental.
48 | Args:
49 | is_train: for training or inference
50 | augmentations: a list of augmentations or deterministic transforms to apply
51 | tfm_gens: data augmentation
52 | image_format: an image format supported by :func:`detection_utils.read_image`.
53 | """
54 | self.is_train = is_train
55 | self.min_size_test = min_size_test
56 | self.max_size_test = max_size_test
57 | self.pixel_mean = torch.tensor(mean)[:,None,None]
58 | self.pixel_std = torch.tensor(std)[:,None,None]
59 |
60 | t = []
61 | t.append(transforms.Resize(self.min_size_test, interpolation=Image.BICUBIC))
62 | self.transform = transforms.Compose(t)
63 |
64 | @classmethod
65 | def from_config(cls, cfg, is_train=True):
66 | ret = {
67 | "is_train": is_train,
68 | "min_size_test": cfg['INPUT']['MIN_SIZE_TEST'],
69 | "max_size_test": cfg['INPUT']['MAX_SIZE_TEST'],
70 | "mean": cfg['INPUT']['PIXEL_MEAN'],
71 | "std": cfg['INPUT']['PIXEL_STD'],
72 | }
73 | return ret
74 |
75 | def __call__(self, dataset_dict):
76 | """
77 | Args:
78 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
79 |
80 | Returns:
81 | dict: a format that builtin models in detectron2 accept
82 | """
83 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
84 | file_name = dataset_dict['file_name']
85 | image = Image.open(file_name).convert('RGB')
86 |
87 | dataset_dict['file_name'] = '_'.join(file_name.split('/')[-3:]) # HACK for /tmp file storage on predictions.
88 | dataset_dict['width'] = image.size[0]
89 | dataset_dict['height'] = image.size[1]
90 |
91 | image = self.transform(image)
92 | image = torch.from_numpy(np.asarray(image).copy())
93 | image = image.permute(2,0,1)
94 | dataset_dict['image'] = image
95 | return dataset_dict
--------------------------------------------------------------------------------
/datasets/dataset_mappers/sunrgbd_dataset_mapper.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
3 | # Copyright (c) 2022 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Modified by Xueyan Zou (xueyan@cs.wisc.edu)
6 | # --------------------------------------------------------
7 | # Copyright (c) Facebook, Inc. and its affiliates.
8 | import copy
9 |
10 | import scipy.io
11 | import numpy as np
12 | import torch
13 | from PIL import Image
14 |
15 | from torchvision import transforms
16 | from dinov.utils import configurable
17 |
18 | __all__ = ["SunRGBDSegDatasetMapper"]
19 |
20 |
21 | # This is specifically designed for the SUN RGB-D dataset.
22 | class SunRGBDSegDatasetMapper:
23 | """
24 | A callable which takes a dataset dict in Detectron2 Dataset format,
25 | and map it into a format used by MaskFormer.
26 |
27 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
28 |
29 | The callable currently does the following:
30 |
31 | 1. Read the image from "file_name"
32 | 2. Applies geometric transforms to the image and annotation
33 | 3. Find and applies suitable cropping to the image and annotation
34 | 4. Prepare image and annotation to Tensors
35 | """
36 |
37 | @configurable
38 | def __init__(
39 | self,
40 | is_train=True,
41 | min_size_test=None,
42 | max_size_test=None,
43 | mean=None,
44 | std=None,
45 | ):
46 | """
47 | NOTE: this interface is experimental.
48 | Args:
49 | is_train: for training or inference
50 | augmentations: a list of augmentations or deterministic transforms to apply
51 | tfm_gens: data augmentation
52 | image_format: an image format supported by :func:`detection_utils.read_image`.
53 | """
54 | self.is_train = is_train
55 | self.min_size_test = min_size_test
56 | self.max_size_test = max_size_test
57 | self.pixel_mean = torch.tensor(mean)[:,None,None]
58 | self.pixel_std = torch.tensor(std)[:,None,None]
59 |
60 | t = []
61 | t.append(transforms.Resize(self.min_size_test, interpolation=Image.BICUBIC))
62 | self.transform = transforms.Compose(t)
63 |
64 | @classmethod
65 | def from_config(cls, cfg, is_train=True):
66 | ret = {
67 | "is_train": is_train,
68 | "min_size_test": cfg['INPUT']['MIN_SIZE_TEST'],
69 | "max_size_test": cfg['INPUT']['MAX_SIZE_TEST'],
70 | "mean": cfg['INPUT']['PIXEL_MEAN'],
71 | "std": cfg['INPUT']['PIXEL_STD'],
72 | }
73 | return ret
74 |
75 | def read_semseg(self, file_name):
76 | if '.png' in file_name:
77 | semseg = np.asarray(Image.open(file_name))
78 | elif '.mat' in file_name:
79 | semseg = scipy.io.loadmat(file_name)['LabelMap']
80 | return semseg
81 |
82 | def __call__(self, dataset_dict):
83 | """
84 | Args:
85 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
86 |
87 | Returns:
88 | dict: a format that builtin models in detectron2 accept
89 | """
90 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
91 | file_name = dataset_dict['file_name']
92 | semseg_name = dataset_dict['sem_seg_file_name']
93 | image = Image.open(file_name).convert('RGB')
94 |
95 | dataset_dict['width'] = image.size[0]
96 | dataset_dict['height'] = image.size[1]
97 |
98 | if self.is_train == False:
99 | image = self.transform(image)
100 | image = torch.from_numpy(np.asarray(image).copy())
101 | image = image.permute(2,0,1)
102 |
103 | semseg = self.read_semseg(semseg_name)
104 | semseg = torch.from_numpy(semseg.astype(np.int32))
105 | dataset_dict['image'] = image
106 | dataset_dict['semseg'] = semseg
107 | return dataset_dict
--------------------------------------------------------------------------------
/datasets/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | from .instance_evaluation import *
2 | from .segmentation_evaluation import *
3 | from .panoptic_evaluation import *
4 | from .interactive_evaluation import *
--------------------------------------------------------------------------------
/datasets/evaluation/instance_evaluation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import contextlib
3 | import copy
4 | import io
5 | import itertools
6 | import json
7 | import logging
8 | import numpy as np
9 | import os
10 | import pickle
11 | from collections import OrderedDict
12 | import pycocotools.mask as mask_util
13 | import torch
14 | from pycocotools.coco import COCO
15 | from pycocotools.cocoeval import COCOeval
16 | from tabulate import tabulate
17 |
18 | import detectron2.utils.comm as comm
19 | from detectron2.config import CfgNode
20 | from detectron2.data import MetadataCatalog
21 | from detectron2.data.datasets.coco import convert_to_coco_json
22 | from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco
23 | from detectron2.evaluation.fast_eval_api import COCOeval_opt
24 | from detectron2.structures import Boxes, BoxMode, pairwise_iou
25 | from detectron2.utils.file_io import PathManager
26 | from detectron2.utils.logger import create_small_table
27 |
28 |
29 | # Modified from COCOEvaluator for instance segmentation.
30 | class InstanceSegEvaluator(COCOEvaluator):
31 | """
32 | Evaluate AR for object proposals, AP for instance detection/segmentation, AP
33 | for keypoint detection outputs using COCO's metrics.
34 | See http://cocodataset.org/#detection-eval and
35 | http://cocodataset.org/#keypoints-eval to understand its metrics.
36 | The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means
37 | the metric cannot be computed (e.g. due to no predictions made).
38 |
39 | In addition to COCO, this evaluator is able to support any bounding box detection,
40 | instance segmentation, or keypoint detection dataset.
41 | """
42 |
43 | def _eval_predictions(self, predictions, img_ids=None):
44 | """
45 | Evaluate predictions. Fill self._results with the metrics of the tasks.
46 | """
47 | self._logger.info("Preparing results for COCO format ...")
48 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions]))
49 | tasks = self._tasks or self._tasks_from_predictions(coco_results)
50 |
51 | # unmap the category ids for COCO
52 | if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
53 | dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id
54 | # all_contiguous_ids = list(dataset_id_to_contiguous_id.values())
55 | # num_classes = len(all_contiguous_ids)
56 | # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1
57 |
58 | reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()}
59 | for result in coco_results:
60 | category_id = result["category_id"]
61 | # assert category_id < num_classes, (
62 | # f"A prediction has class={category_id}, "
63 | # f"but the dataset only has {num_classes} classes and "
64 | # f"predicted class id should be in [0, {num_classes - 1}]."
65 | # )
66 | assert category_id in reverse_id_mapping, (
67 | f"A prediction has class={category_id}, "
68 | f"but the dataset only has class ids in {dataset_id_to_contiguous_id}."
69 | )
70 | result["category_id"] = reverse_id_mapping[category_id]
71 |
72 | if self._output_dir:
73 | file_path = os.path.join(self._output_dir, "coco_instances_results.json")
74 | self._logger.info("Saving results to {}".format(file_path))
75 | with PathManager.open(file_path, "w") as f:
76 | f.write(json.dumps(coco_results))
77 | f.flush()
78 |
79 | if not self._do_evaluation:
80 | self._logger.info("Annotations are not available for evaluation.")
81 | return
82 |
83 | self._logger.info(
84 | "Evaluating predictions with {} COCO API...".format(
85 | "unofficial" if self._use_fast_impl else "official"
86 | )
87 | )
88 | for task in sorted(tasks):
89 | assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!"
90 | coco_eval = (
91 | _evaluate_predictions_on_coco(
92 | self._coco_api,
93 | coco_results,
94 | task,
95 | kpt_oks_sigmas=self._kpt_oks_sigmas,
96 | use_fast_impl=self._use_fast_impl,
97 | img_ids=img_ids,
98 | max_dets_per_image=self._max_dets_per_image,
99 | )
100 | if len(coco_results) > 0
101 | else None # cocoapi does not handle empty results very well
102 | )
103 |
104 | res = self._derive_coco_results(
105 | coco_eval, task, class_names=self._metadata.get("thing_classes")
106 | )
107 | self._results[task] = res
108 |
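109 | 
110 | # --- Usage sketch (added for illustration, not part of the original file) ---
111 | # This evaluator follows the standard detectron2 evaluation flow, e.g.:
112 | #
113 | #   from detectron2.evaluation import inference_on_dataset
114 | #   evaluator = InstanceSegEvaluator("coco_2017_val", output_dir="./output")
115 | #   results = inference_on_dataset(model, val_loader, evaluator)
116 | #
117 | # where model and val_loader are assumed to exist; see train_net.py for the actual setup.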
--------------------------------------------------------------------------------
/datasets/evaluation/pascal_part_evaluation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import contextlib
3 | import copy
4 | import io
5 | import itertools
6 | import json
7 | import logging
8 | import numpy as np
9 | import os
10 | import pickle
11 | from collections import OrderedDict
12 | import pycocotools.mask as mask_util
13 | import torch
14 | from pycocotools.coco import COCO
15 | from pycocotools.cocoeval import COCOeval
16 | from tabulate import tabulate
17 |
18 | import detectron2.utils.comm as comm
19 | from detectron2.config import CfgNode
20 | from detectron2.data import MetadataCatalog
21 | from detectron2.data.datasets.coco import convert_to_coco_json
22 | from detectron2.evaluation.coco_evaluation import COCOEvaluator
23 | from detectron2.structures import Boxes, BoxMode, pairwise_iou
24 | from detectron2.utils.file_io import PathManager
25 | from detectron2.utils.logger import create_small_table
26 | from ..registration.register_pascal_part_all import (
27 | PASCAL_PART_BASE_CATEGORIES as categories_seen,
28 | PASCAL_PART_NOVEL_CATEGORIES as categories_unseen,
29 | )
30 |
31 |
32 | class PASCALPARTEvaluator(COCOEvaluator):
33 | """
34 | PASCALPARTEvaluator on open_vocabulary
35 | """
36 |
37 | def _derive_coco_results(self, coco_eval, iou_type, class_names=None):
38 | """
39 | Additionally plot mAP for 'seen classes' and 'unseen classes'
40 | """
41 |
42 | metrics = {
43 | "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
44 | "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
45 | "keypoints": ["AP", "AP50", "AP75", "APm", "APl"],
46 | }[iou_type]
47 |
48 | if coco_eval is None:
49 | self._logger.warn("No predictions from the model!")
50 | return {metric: float("nan") for metric in metrics}
51 |
52 | # the standard metrics
53 | results = {
54 | metric: float(coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan")
55 | for idx, metric in enumerate(metrics)
56 | }
57 | self._logger.info(
58 | "Evaluation results for {}: \n".format(iou_type) + create_small_table(results)
59 | )
60 | if not np.isfinite(sum(results.values())):
61 | self._logger.info("Some metrics cannot be computed and is shown as NaN.")
62 |
63 | if class_names is None or len(class_names) <= 1:
64 | return results
65 | # Compute per-category AP
66 | # from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252 # noqa
67 | precisions = coco_eval.eval["precision"]
68 | # precision has dims (iou, recall, cls, area range, max dets)
69 | assert len(class_names) == precisions.shape[2]
70 |
71 | seen_names = set([x['name'] for x in categories_seen])
72 | unseen_names = set([x['name'] for x in categories_unseen])
73 | results_per_category = []
74 | results_per_category50 = []
75 | results_per_category_seen = []
76 | results_per_category_unseen = []
77 | results_per_category50_seen = []
78 | results_per_category50_unseen = []
79 | for idx, name in enumerate(class_names):
80 | # area range index 0: all area ranges
81 | # max dets index -1: typically 100 per image
82 | precision = precisions[:, :, idx, 0, -1]
83 | precision = precision[precision > -1]
84 | ap = np.mean(precision) if precision.size else float("nan")
85 | results_per_category.append(("{}".format(name), float(ap * 100)))
86 | precision50 = precisions[0, :, idx, 0, -1]
87 | precision50 = precision50[precision50 > -1]
88 | ap50 = np.mean(precision50) if precision50.size else float("nan")
89 | results_per_category50.append(("{}".format(name), float(ap50 * 100)))
90 | if name in seen_names:
91 | results_per_category_seen.append(float(ap * 100))
92 | results_per_category50_seen.append(float(ap50 * 100))
93 | if name in unseen_names:
94 | results_per_category_unseen.append(float(ap * 100))
95 | results_per_category50_unseen.append(float(ap50 * 100))
96 |
97 | # tabulate it
98 | N_COLS = min(6, len(results_per_category) * 2)
99 | results_flatten = list(itertools.chain(*results_per_category))
100 | results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)])
101 | table = tabulate(
102 | results_2d,
103 | tablefmt="pipe",
104 | floatfmt=".3f",
105 | headers=["category", "AP"] * (N_COLS // 2),
106 | numalign="left",
107 | )
108 | self._logger.info("Per-category {} AP: \n".format(iou_type) + table)
109 |
110 | N_COLS = min(6, len(results_per_category50) * 2)
111 | results_flatten = list(itertools.chain(*results_per_category50))
112 | results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)])
113 | table = tabulate(
114 | results_2d,
115 | tablefmt="pipe",
116 | floatfmt=".3f",
117 | headers=["category", "AP50"] * (N_COLS // 2),
118 | numalign="left",
119 | )
120 | self._logger.info("Per-category {} AP50: \n".format(iou_type) + table)
121 |
122 | self._logger.info(
123 | "Seen {} AP: {}".format(
124 | iou_type,
125 | sum(results_per_category_seen) / len(results_per_category_seen),
126 | ))
127 | self._logger.info(
128 | "Unseen {} AP: {}".format(
129 | iou_type,
130 | sum(results_per_category_unseen) / len(results_per_category_unseen),
131 | ))
132 |
133 | self._logger.info(
134 | "Seen {} AP50: {}".format(
135 | iou_type,
136 | sum(results_per_category50_seen) / len(results_per_category50_seen),
137 | ))
138 | self._logger.info(
139 | "Unseen {} AP50: {}".format(
140 | iou_type,
141 | sum(results_per_category50_unseen) / len(results_per_category50_unseen),
142 | ))
143 |
144 | results.update({"AP-" + name: ap for name, ap in results_per_category})
145 | results["AP-seen"] = sum(results_per_category_seen) / len(results_per_category_seen)
146 | results["AP-unseen"] = sum(results_per_category_unseen) / len(results_per_category_unseen)
147 | results["AP50-seen"] = sum(results_per_category50_seen) / len(results_per_category50_seen)
148 | results["AP50-unseen"] = sum(results_per_category50_unseen) / len(results_per_category50_unseen)
149 | return results
--------------------------------------------------------------------------------
/datasets/registration/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from . import (
3 | register_ade20k_full,
4 | register_ade20k_panoptic,
5 | register_coco_stuff_10k,
6 | register_coco_panoptic_annos_semseg,
7 | register_coco_panoptic_annos_semseg_interactive,
8 | register_coco_panoptic_annos_semseg_interactive_jointboxpoint,
9 | register_ade20k_instance,
10 | register_sam,
11 | register_sunrgbd_semseg,
12 | register_scannet_semseg,
13 | register_bdd100k_semseg,
14 | register_scannet_panoptic,
15 | register_bdd100k_panoseg,
16 | register_object365_od,
17 | register_pascal_part_all,
18 | register_pascal_part_all_interactive,
19 | register_paco_part_all,
20 | register_partimagenet_part_all,
21 | )
22 |
23 | from . import (
24 | register_ytvos_dataset,
25 | register_davis_dataset,
26 | register_seginw_instance,
27 | register_lvis_eval,
28 | register_context_semseg,
29 | register_odinw_od,
30 | )
--------------------------------------------------------------------------------
/datasets/registration/register_ade20k_instance.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import json
3 | import logging
4 | import numpy as np
5 | import os
6 | from PIL import Image
7 |
8 | from detectron2.data import DatasetCatalog, MetadataCatalog
9 | from detectron2.data.datasets.coco import load_coco_json, register_coco_instances
10 | from detectron2.utils.file_io import PathManager
11 |
12 | ADE_CATEGORIES = [{'id': 7, 'name': 'bed'}, {'id': 8, 'name': 'windowpane'}, {'id': 10, 'name': 'cabinet'}, {'id': 12, 'name': 'person'}, {'id': 14, 'name': 'door'}, {'id': 15, 'name': 'table'}, {'id': 18, 'name': 'curtain'}, {'id': 19, 'name': 'chair'}, {'id': 20, 'name': 'car'}, {'id': 22, 'name': 'painting'}, {'id': 23, 'name': 'sofa'}, {'id': 24, 'name': 'shelf'}, {'id': 27, 'name': 'mirror'}, {'id': 30, 'name': 'armchair'}, {'id': 31, 'name': 'seat'}, {'id': 32, 'name': 'fence'}, {'id': 33, 'name': 'desk'}, {'id': 35, 'name': 'wardrobe'}, {'id': 36, 'name': 'lamp'}, {'id': 37, 'name': 'bathtub'}, {'id': 38, 'name': 'railing'}, {'id': 39, 'name': 'cushion'}, {'id': 41, 'name': 'box'}, {'id': 42, 'name': 'column'}, {'id': 43, 'name': 'signboard'}, {'id': 44, 'name': 'chest of drawers'}, {'id': 45, 'name': 'counter'}, {'id': 47, 'name': 'sink'}, {'id': 49, 'name': 'fireplace'}, {'id': 50, 'name': 'refrigerator'}, {'id': 53, 'name': 'stairs'}, {'id': 55, 'name': 'case'}, {'id': 56, 'name': 'pool table'}, {'id': 57, 'name': 'pillow'}, {'id': 58, 'name': 'screen door'}, {'id': 62, 'name': 'bookcase'}, {'id': 64, 'name': 'coffee table'}, {'id': 65, 'name': 'toilet'}, {'id': 66, 'name': 'flower'}, {'id': 67, 'name': 'book'}, {'id': 69, 'name': 'bench'}, {'id': 70, 'name': 'countertop'}, {'id': 71, 'name': 'stove'}, {'id': 72, 'name': 'palm'}, {'id': 73, 'name': 'kitchen island'}, {'id': 74, 'name': 'computer'}, {'id': 75, 'name': 'swivel chair'}, {'id': 76, 'name': 'boat'}, {'id': 78, 'name': 'arcade machine'}, {'id': 80, 'name': 'bus'}, {'id': 81, 'name': 'towel'}, {'id': 82, 'name': 'light'}, {'id': 83, 'name': 'truck'}, {'id': 85, 'name': 'chandelier'}, {'id': 86, 'name': 'awning'}, {'id': 87, 'name': 'streetlight'}, {'id': 88, 'name': 'booth'}, {'id': 89, 'name': 'television receiver'}, {'id': 90, 'name': 'airplane'}, {'id': 92, 'name': 'apparel'}, {'id': 93, 'name': 'pole'}, {'id': 95, 'name': 'bannister'}, {'id': 97, 'name': 'ottoman'}, {'id': 98, 'name': 'bottle'}, {'id': 102, 'name': 'van'}, {'id': 103, 'name': 'ship'}, {'id': 104, 'name': 'fountain'}, {'id': 107, 'name': 'washer'}, {'id': 108, 'name': 'plaything'}, {'id': 110, 'name': 'stool'}, {'id': 111, 'name': 'barrel'}, {'id': 112, 'name': 'basket'}, {'id': 115, 'name': 'bag'}, {'id': 116, 'name': 'minibike'}, {'id': 118, 'name': 'oven'}, {'id': 119, 'name': 'ball'}, {'id': 120, 'name': 'food'}, {'id': 121, 'name': 'step'}, {'id': 123, 'name': 'trade name'}, {'id': 124, 'name': 'microwave'}, {'id': 125, 'name': 'pot'}, {'id': 126, 'name': 'animal'}, {'id': 127, 'name': 'bicycle'}, {'id': 129, 'name': 'dishwasher'}, {'id': 130, 'name': 'screen'}, {'id': 132, 'name': 'sculpture'}, {'id': 133, 'name': 'hood'}, {'id': 134, 'name': 'sconce'}, {'id': 135, 'name': 'vase'}, {'id': 136, 'name': 'traffic light'}, {'id': 137, 'name': 'tray'}, {'id': 138, 'name': 'ashcan'}, {'id': 139, 'name': 'fan'}, {'id': 142, 'name': 'plate'}, {'id': 143, 'name': 'monitor'}, {'id': 144, 'name': 'bulletin board'}, {'id': 146, 'name': 'radiator'}, {'id': 147, 'name': 'glass'}, {'id': 148, 'name': 'clock'}, {'id': 149, 'name': 'flag'}]
13 |
14 |
15 | _PREDEFINED_SPLITS = {
16 | # point annotations without masks
17 | "ade20k_instance_train": (
18 | "ADEChallengeData2016/images/training",
19 | "ADEChallengeData2016/ade20k_instance_train.json",
20 | ),
21 | "ade20k_instance_val": (
22 | "ADEChallengeData2016/images/validation",
23 | "ADEChallengeData2016/ade20k_instance_val.json",
24 | ),
25 | }
26 |
27 |
28 | def _get_ade_instances_meta():
29 | thing_ids = [k["id"] for k in ADE_CATEGORIES]
30 | assert len(thing_ids) == 100, len(thing_ids)
31 | # Mapping from the incontiguous ADE category id to an id in [0, 99]
32 | thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)}
33 | thing_classes = [k["name"] for k in ADE_CATEGORIES]
34 | ret = {
35 | "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id,
36 | "thing_classes": thing_classes,
37 | }
38 | return ret
39 |
40 |
41 | def register_all_ade20k_instance(root):
42 | for key, (image_root, json_file) in _PREDEFINED_SPLITS.items():
43 | # Assume pre-defined datasets live in `./datasets`.
44 | register_coco_instances(
45 | key,
46 | _get_ade_instances_meta(),
47 | os.path.join(root, json_file) if "://" not in json_file else json_file,
48 | os.path.join(root, image_root),
49 | )
50 |
51 |
52 | _root = os.getenv("DETECTRON2_DATASETS", "datasets")
53 | if _root!='datasets':
54 | register_all_ade20k_instance(_root)
55 |
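
The metadata above maps the non-contiguous ADE category ids (7, 8, 10, ..., 149) onto a contiguous [0, 99] range in list order; a small check, using only the ADE_CATEGORIES list above:

meta = _get_ade_instances_meta()
assert meta["thing_dataset_id_to_contiguous_id"][7] == 0     # 'bed' is the first category
assert meta["thing_dataset_id_to_contiguous_id"][149] == 99  # 'flag' is the last of the 100
assert meta["thing_classes"][0] == "bed"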
--------------------------------------------------------------------------------
/datasets/registration/register_bdd100k_semseg.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
3 | # Copyright (c) 2022 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Modified by Xueyan Zou (xueyan@cs.wisc.edu)
6 | # --------------------------------------------------------
7 | # Copyright (c) Facebook, Inc. and its affiliates.
8 | import numpy as np
9 | import os
10 | import glob
11 | from typing import List, Tuple, Union
12 |
13 | from detectron2.data import DatasetCatalog, MetadataCatalog
14 | from detectron2.utils.file_io import PathManager
15 |
16 | from utils.constants import BDD_SEM
17 |
18 | __all__ = ["load_bdd_instances", "register_bdd_context"]
19 |
20 |
21 | def load_bdd_instances(name: str, dirname: str, split: str, class_names: Union[List[str], Tuple[str, ...]]):
22 | """
23 | Load BDD annotations to Detectron2 format.
24 |
25 | Args:
26 | dirname: contains the "images" and "labels" subdirectories of BDD100K
27 | split (str): one of "train", "test", "val", "trainval"
28 | class_names: list or tuple of class names
29 | """
30 | img_folder = os.path.join(dirname, 'images', '10k', split)
31 | img_pths = sorted(glob.glob(os.path.join(img_folder, '*.jpg')))
32 |
33 | sem_folder = os.path.join(dirname, 'labels', 'sem_seg', 'masks', split)
34 | sem_pths = sorted(glob.glob(os.path.join(sem_folder, '*.png')))
35 |
36 | assert len(img_pths) == len(sem_pths)
37 |
38 | dicts = []
39 | for img_pth, sem_pth in zip(img_pths, sem_pths):
40 | r = {
41 | "file_name": img_pth,
42 | "sem_seg_file_name": sem_pth,
43 | "image_id": img_pth.split('/')[-1].split('.')[0],
44 | }
45 | dicts.append(r)
46 | return dicts
47 |
48 |
49 | def register_bdd_context(name, dirname, split, class_names=BDD_SEM):
50 | DatasetCatalog.register(name, lambda: load_bdd_instances(name, dirname, split, class_names))
51 | MetadataCatalog.get(name).set(
52 | stuff_classes=class_names,
53 | dirname=dirname,
54 | split=split,
55 | ignore_label=[255],
56 | thing_dataset_id_to_contiguous_id={},
57 | class_offset=0,
58 | keep_sem_bgd=False
59 | )
60 |
61 |
62 | def register_all_bdd_semseg(root):
63 | SPLITS = [
64 | ("bdd10k_val_sem_seg", "bdd100k", "val"),
65 | ]
66 |
67 | for name, dirname, split in SPLITS:
68 | register_bdd_context(name, os.path.join(root, dirname), split)
69 | MetadataCatalog.get(name).evaluator_type = "sem_seg"
70 |
71 |
72 | _root = os.getenv("DATSETW", "datasets")
73 | if _root!='datasets':
74 | register_all_bdd_semseg(_root)
--------------------------------------------------------------------------------
/datasets/registration/register_context_semseg.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (c) Facebook, Inc. and its affiliates.
3 | import numpy as np
4 | import os
5 | import xml.etree.ElementTree as ET
6 | from typing import List, Tuple, Union
7 |
8 | from detectron2.data import DatasetCatalog, MetadataCatalog
9 | from detectron2.structures import BoxMode
10 | from detectron2.utils.file_io import PathManager
11 |
12 | from utils.constants import PASCAL_CONTEXT_459, PASCAL_CONTEXT_59, PASCAL_CONTEXT_33
13 |
14 | __all__ = ["load_context_instances", "register_pascal_context"]
15 | dataset2class = {"context_459_val_seg": PASCAL_CONTEXT_459,
16 | "context_59_val_seg": PASCAL_CONTEXT_59}
17 | dataset2labelfolder = {"context_459_val_seg": "trainval",
18 | "context_59_val_seg": "59_context_labels"}
19 | dataset2postfix = {"context_459_val_seg": ".mat",
20 | "context_59_val_seg": ".png"}
21 | dataset2segloader = {"context_459_val_seg": "MAT",
22 | "context_59_val_seg": "PIL"}
23 |
24 |
25 | def load_context_instances(name: str, dirname: str, split: str, class_names: Union[List[str], Tuple[str, ...]]):
26 | """
27 | Load Pascal VOC detection annotations to Detectron2 format.
28 |
29 | Args:
30 | dirname: Contain "Annotations", "ImageSets", "JPEGImages"
31 | split (str): one of "train", "test", "val", "trainval"
32 | class_names: list or tuple of class names
33 | """
34 | with PathManager.open(os.path.join(dirname, "VOC2010", "ImageSets", "Main", split + ".txt")) as f:
35 | fileids = np.loadtxt(f, dtype=str)
36 |
37 | # Needs to read many small annotation files. Makes sense at local
38 | image_dirname = PathManager.get_local_path(os.path.join(dirname, "VOC2010"))
39 | semseg_dirname = PathManager.get_local_path(os.path.join(dirname, dataset2labelfolder[name]))
40 |
41 | dicts = []
42 | for fileid in fileids:
43 | jpeg_file = os.path.join(image_dirname, "JPEGImages", fileid + ".jpg")
44 | seg_file = os.path.join(semseg_dirname, fileid + dataset2postfix[name])
45 |
46 | r = {
47 | "file_name": jpeg_file,
48 | "sem_seg_file_name": seg_file,
49 | "image_id": fileid,
50 | }
51 | dicts.append(r)
52 | return dicts
53 |
54 |
55 | def register_pascal_context(name, dirname, split, year, class_names=dataset2class):
56 | DatasetCatalog.register(name, lambda: load_context_instances(name, dirname, split, class_names))
57 | MetadataCatalog.get(name).set(
58 | stuff_classes=class_names[name],
59 | dirname=dirname,
60 | year=year,
61 | split=split,
62 | ignore_label=[0],
63 | thing_dataset_id_to_contiguous_id={},
64 | class_offset=1,
65 | semseg_loader=dataset2segloader[name],
66 | keep_sem_bgd=False
67 | )
68 |
69 |
70 | def register_all_context_seg(root):
71 | SPLITS = [
72 | ("context_459_val_seg", "pascal_context", "val"),
73 | ("context_59_val_seg", "pascal_context", "val"),
74 | ]
75 | year = 2010
76 | for name, dirname, split in SPLITS:
77 | register_pascal_context(name, os.path.join(root, dirname), split, year)
78 | MetadataCatalog.get(name).evaluator_type = "sem_seg"
79 |
80 |
81 | _root = os.getenv("DATSETW", "datasets")
82 | if _root!='datasets':
83 | register_all_context_seg(_root)
--------------------------------------------------------------------------------
/datasets/registration/register_davis_dataset.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (c) Facebook, Inc. and its affiliates.
3 | import os
4 | import glob
5 | import json
6 | from typing import List, Tuple, Union
7 |
8 | import cv2
9 | import numpy as np
10 | from scipy.io import loadmat
11 |
12 | from detectron2.data import DatasetCatalog, MetadataCatalog
13 | from detectron2.structures import BoxMode
14 | from detectron2.utils.file_io import PathManager
15 |
16 |
17 | __all__ = ["load_davis_instances", "register_davis_context"]
18 |
19 | def load_davis_instances(name: str, dirname: str, split: str, year: str):
20 | """
21 | Load DAVIS annotations to Detectron2 format.
22 |
23 | Args:
24 | dirname: contains "JPEGImages", "Annotations", "Scribbles", "ImageSets" and "video_objects_info.json"
25 | split (str): one of "train", "val"
26 | year (str): "2016" or "2017"
27 | """
28 | meta_txt = os.path.join(dirname, 'ImageSets', year, "{}.txt".format(split))
29 | meta_json = os.path.join(dirname, 'video_objects_info.json')
30 | meta_json = json.load(open(meta_json))['videos']
31 | video_names = [line.strip() for line in open(meta_txt).readlines()]
32 |
33 | video_dir = os.path.join(dirname, 'JPEGImages', '480p')
34 | mask_dir = os.path.join(dirname, 'Annotations', '480p')
35 | scibble_dir = os.path.join(dirname, 'Scribbles', '480p')
36 | semantic_dir = os.path.join(dirname, 'Annotations_semantics', '480p')
37 |
38 | dicts = []
39 | for vid_name in video_names:
40 | objects = meta_json[vid_name]['objects']
41 | r = {
42 | "file_name": os.path.join(video_dir, vid_name),
43 | "mask_name": os.path.join(mask_dir, vid_name),
44 | "scibble_name": os.path.join(scibble_dir, vid_name),
45 | "semantic_name": os.path.join(semantic_dir, vid_name),
46 | "objects": objects,
47 | }
48 | dicts.append(r)
49 | return dicts
50 |
51 | def register_davis_context(name, dirname, split, year):
52 | if not os.path.exists(dirname):
53 | print("not register for ", name)
54 | return -1
55 | load_davis_instances(name, dirname, split, year)
56 | DatasetCatalog.register("{}".format(name), lambda: load_davis_instances(name, dirname, split, year))
57 | MetadataCatalog.get("{}".format(name)).set(
58 | dirname=dirname,
59 | thing_dataset_id_to_contiguous_id={},
60 | )
61 |
62 | def register_all_davis(root):
63 | SPLITS = [
64 | ("davis17_val", "DAVIS17", "val", "2017"),
65 | ("davis16_val", "DAVIS17", "val", "2016"),
66 | ]
67 |
68 | for name, dirname, split, year in SPLITS:
69 | register_davis_context(name, os.path.join(root, dirname), split, year)
70 | MetadataCatalog.get("{}".format(name)).evaluator_type = None
71 |
72 | _root = os.getenv("TRACKING_DATASET", "datasets")
73 | if _root!='datasets':
74 | register_all_davis(_root)
75 |
--------------------------------------------------------------------------------
/datasets/registration/register_lvis_eval.py:
--------------------------------------------------------------------------------
1 | from detectron2.data.datasets import get_lvis_instances_meta
2 | from detectron2.data import DatasetCatalog, MetadataCatalog
3 | from utils.lvis_cat import LVIS_CATEGORIES as LVIS_V1_CATEGORIES
4 | # from utils.constants import LVIS_CATEGORIES as LVIS_V1_CATEGORIES
5 | import logging
6 | import os
7 | from detectron2.utils.file_io import PathManager
8 | from fvcore.common.timer import Timer
9 | import json
10 |
11 |
12 |
13 | _PREDEFINED_SPLITS_LVIS = {
14 | "lvis_v1": {
15 | "lvis_v1_minival": ("coco/", "coco/annotations/lvis_v1_minival_inserted_image_name.json"),
16 | "lvis_train": ("coco/", "lvis/lvis_v1_train.json"),
17 | },
18 | }
19 |
20 | def get_lvis_instances_meta_v1():
21 | assert len(LVIS_V1_CATEGORIES) == 1203
22 | cat_ids = [k["id"] for k in LVIS_V1_CATEGORIES]
23 | assert min(cat_ids) == 1 and max(cat_ids) == len(
24 | cat_ids
25 | ), "Category ids are not in [1, #categories], as expected"
26 | # Ensure that the category list is sorted by id
27 | thing_ids = [k["id"] for k in LVIS_V1_CATEGORIES]
28 | # lvis_categories = sorted(LVIS_V1_CATEGORIES, key=lambda x: x["id"])
29 | thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)}
30 | # thing_classes = [k["name"] for k in O365_CATEGORIES]
31 | def preprocess_name(name):
32 | name = name.lower().strip()
33 | name = name.replace('_', ' ')
34 | return name
35 | thing_classes = [preprocess_name(k["synonyms"][0]) for k in LVIS_V1_CATEGORIES]
36 | meta = {
37 | "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id,
38 | "thing_classes": thing_classes,
39 | }
40 | return meta
41 |
42 |
43 | def register_lvis_instances(name, metadata, json_file, image_root):
44 | """
45 | Register a dataset in LVIS's json annotation format for instance detection and segmentation.
46 |
47 | Args:
48 | name (str): a name that identifies the dataset, e.g. "lvis_v0.5_train".
49 | metadata (dict): extra metadata associated with this dataset. It can be an empty dict.
50 | json_file (str): path to the json instance annotation file.
51 | image_root (str or path-like): directory which contains all the images.
52 | """
53 | DatasetCatalog.register(name, lambda: load_lvis_json(image_root, json_file, name))
54 | MetadataCatalog.get(name).set(
55 | json_file=json_file, image_root=image_root, evaluator_type="lvis", **metadata
56 | )
57 |
58 |
59 | def load_lvis_json(image_root, annot_json, metadata):
60 | """
61 | Args:
62 | image_root (str): directory which contains all the images, e.g., "~/coco/".
63 | annot_json (str): path to the LVIS json annotation file.
64 | metadata: dataset metadata (unused here).
65 | Returns:
66 | list[dict]: a list of dicts in Detectron2 standard format. (See
67 | `Using Custom Datasets </tutorials/datasets.html>`_ )
68 | """
69 | with PathManager.open(annot_json) as f:
70 | json_info = json.load(f)
71 |
72 | imageid2seg = {}
73 | imageid2box = {}
74 | imageid2lable = {}
75 | for anno in json_info["annotations"]:
76 | image_id = anno['image_id']
77 | seg = anno["segmentation"]
78 | bbox = anno["bbox"]
79 | label = anno["category_id"]
80 | if image_id not in imageid2seg:
81 | imageid2seg[image_id] = []
82 | if image_id not in imageid2box:
83 | imageid2box[image_id] = []
84 | if image_id not in imageid2lable:
85 | imageid2lable[image_id] = []
86 | imageid2seg[image_id] += [seg]
87 | imageid2box[image_id] += [bbox]
88 | imageid2lable[image_id] += [label]
89 |
90 | ret = []
91 | cnt_empty = 0
92 | for image in json_info["images"]:
93 | image_file = os.path.join(image_root ,'/'.join(image["coco_url"].split('/')[-2:]))
94 | image_id = image['id']
95 | if image_id not in imageid2lable:
96 | cnt_empty += 1
97 | continue
98 | ret.append(
99 | {
100 | "file_name": image_file,
101 | "image_id": image_id,
102 | "height": image['height'],
103 | "width": image['width'],
104 | "instance": imageid2seg[image_id],
105 | "box": imageid2box[image_id],
106 | "labels": imageid2lable[image_id],
107 | }
108 | )
109 |
110 | print("Empty annotations: {}".format(cnt_empty))
111 | assert len(ret), f"No images found in {image_root}!"
112 | assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"]
113 | return ret
114 |
115 |
116 | def register_all_lvis(_root_eval, _root_train):
117 | for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_LVIS.items():
118 | for key, (image_root, json_file) in splits_per_dataset.items():
119 | if 'val' in key:
120 | root = _root_eval
121 | else:
122 | root = _root_train
123 | register_lvis_instances(
124 | key,
125 | get_lvis_instances_meta_v1(),
126 | os.path.join(root, json_file) if "://" not in json_file else json_file,
127 | os.path.join(root, image_root),
128 | )
129 |
130 |
131 | _root_eval = os.getenv("DATASET3", "datasets")
132 | _root_train = os.getenv("DATASET", "datasets")
133 | if _root_train!='datasets' and _root_eval!='datasets':
134 | register_all_lvis(_root_eval, _root_train)
--------------------------------------------------------------------------------
/datasets/registration/register_paco_part_all.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import logging
3 | import os
4 | from detectron2.data import DatasetCatalog, MetadataCatalog
5 | import copy
6 | # from detectron2.data.datasets.register_coco import register_coco_instances
7 | from detectron2.data.datasets.coco import load_coco_json
8 | import json
9 |
10 |
11 | def _get_paco_metadata(key):
12 | # if '_base' in key:
13 | # id_to_name = {x['id']: x['name'] for x in PASCAL_PART_BASE_CATEGORIES}
14 | # else:
15 | id_to_name = {x['id']: x['name'] for x in PACO_CATEGORIES}
16 |
17 | thing_classes_ = [id_to_name[k] for k in sorted(id_to_name)]
18 | PACO_CATEGORIES_=copy.deepcopy(PACO_CATEGORIES)
19 | for cat in PACO_CATEGORIES_:
20 | if ':' not in cat['name']:
21 | cat['name']=cat['name']+':whole'
22 | if '_(' in cat['name']:
23 | cat['name']=cat['name'][:cat['name'].find('_(')]+cat['name'][cat['name'].find(')')+1:]
24 | if '_' in cat['name']:
25 | cat['name']=cat['name'].replace('_',' ')
26 | id_to_name = {x['id']: x['name'] for x in PACO_CATEGORIES_}
27 | thing_dataset_id_to_contiguous_id = {
28 | x: i for i, x in enumerate(sorted(id_to_name))}
29 | thing_classes = [id_to_name[k] for k in sorted(id_to_name)]
30 |
31 | part_classes = [a.split(":")[1].lower() for a in thing_classes]
32 | thing_clases_id_to_part_id={v: sorted(set(part_classes)).index(n) for v, n in enumerate(part_classes)}
33 | whole_classes = [a.split(":")[0].lower() for a in thing_classes]
34 |
35 | no_part_index = sorted(set(part_classes)).index('whole')
36 | thing_classes_id_without_part = [k for k, v in thing_clases_id_to_part_id.items() if no_part_index==v]
37 |
38 | thing_clases_id_to_whole_id={v: sorted(set(whole_classes)).index(n) for v, n in enumerate(whole_classes)}
39 | thing_clases_id_to_flattened_wholepart = {tid: thing_clases_id_to_whole_id[tid]*len(set(part_classes))+pid for tid, pid in thing_clases_id_to_part_id.items()}
40 | return {
41 | "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id,
42 | "thing_classes": thing_classes_,
43 | "thing_clases_id_to_part_id": thing_clases_id_to_part_id,
44 | "part_classes": sorted(set(part_classes)),
45 | "thing_clases_id_to_whole_id": thing_clases_id_to_whole_id,
46 | "whole_classes": sorted(set(whole_classes)),
47 | "thing_clases_id_to_flattened_wholepart": thing_clases_id_to_flattened_wholepart,
48 | "thing_classes_id_without_part": thing_classes_id_without_part,
49 | }
50 |
51 |
52 | def register_paco_part_instances(name, metadata, json_file, image_root):
53 | DatasetCatalog.register(name, lambda: load_coco_json(
54 | json_file, image_root, name))
55 | MetadataCatalog.get(name).set(
56 | json_file=json_file, image_root=image_root,
57 | evaluator_type="pascal_part_interactive", **metadata
58 | )
59 |
60 | _PACO = {
61 | "paco_train": ("coco", "paco/annotations/paco_lvis_v1_train.json"),
62 | # "pascal_part_train_one": ("pascal_part/VOCdevkit/VOC2010/JPEGImages", "pascal_part/train_one.json"),
63 | "paco_val_inter": ("coco", "paco/annotations/paco_lvis_v1_val_mini.json"),
64 | # "paco_test": ("paco/val2017", "paco/annotations/paco_lvis_v1_val.json"),
65 | # "pascal_part_base_train": ("pascal_part/VOCdevkit/VOC2010/JPEGImages", "pascal_part/train_base.json"),
66 | # "pascal_part_base_train_one": ("pascal_part/VOCdevkit/VOC2010/JPEGImages", "pascal_part/train_base_one.json"),
67 | # "imagenet_voc_parsed": ("imagenet/train", "imagenet/imagenet_voc_image_parsed.json"),
68 | # "imagenet_golden_pascal_parsed": ("imagenet/train", "imagenet/imagenet_golden_pascal_parsed.json"),
69 | # "imagenet_golden_pascal_parsed_swinbase": ("imagenet/train", "imagenet/imagenet_golden_pascal_parsed_swinbase.json"),
70 | }
71 |
72 |
73 | def register_paco_part(root):
74 | for key, (image_root, json_file) in _PACO.items():
75 | register_paco_part_instances(
76 | key,
77 | _get_paco_metadata(key),
78 | os.path.join(root, json_file) if "://" not in json_file else json_file,
79 | os.path.join(root, image_root),
80 | )
81 |
82 | _root = os.getenv("PACO", "datasets")
83 | if _root!="datasets":
84 | with open(os.path.join(_root,"paco/annotations/paco_lvis_v1_val.json")) as f:
85 | j=json.load(f)
86 | PACO_CATEGORIES=j['categories']
87 |
88 | register_paco_part(_root)
89 | else:
90 | print("skip paco register")
--------------------------------------------------------------------------------
/datasets/registration/register_partimagenet_part_all.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import logging
3 | import os
4 | from detectron2.data import DatasetCatalog, MetadataCatalog
5 | from detectron2.data.datasets.coco import load_coco_json
6 |
7 | PART_IN_CATEGORIES = [{'id': 0, 'name': 'Quadruped Head', 'supercategory': 'Quadruped'},
8 | {'id': 1, 'name': 'Quadruped Body', 'supercategory': 'Quadruped'},
9 | {'id': 2, 'name': 'Quadruped Foot', 'supercategory': 'Quadruped'},
10 | {'id': 3, 'name': 'Quadruped Tail', 'supercategory': 'Quadruped'},
11 | {'id': 4, 'name': 'Biped Head', 'supercategory': 'Biped'},
12 | {'id': 5, 'name': 'Biped Body', 'supercategory': 'Biped'},
13 | {'id': 6, 'name': 'Biped Hand', 'supercategory': 'Biped'},
14 | {'id': 7, 'name': 'Biped Foot', 'supercategory': 'Biped'},
15 | {'id': 8, 'name': 'Biped Tail', 'supercategory': 'Biped'},
16 | {'id': 9, 'name': 'Fish Head', 'supercategory': 'Fish'},
17 | {'id': 10, 'name': 'Fish Body', 'supercategory': 'Fish'},
18 | {'id': 11, 'name': 'Fish Fin', 'supercategory': 'Fish'},
19 | {'id': 12, 'name': 'Fish Tail', 'supercategory': 'Fish'},
20 | {'id': 13, 'name': 'Bird Head', 'supercategory': 'Bird'},
21 | {'id': 14, 'name': 'Bird Body', 'supercategory': 'Bird'},
22 | {'id': 15, 'name': 'Bird Wing', 'supercategory': 'Bird'},
23 | {'id': 16, 'name': 'Bird Foot', 'supercategory': 'Bird'},
24 | {'id': 17, 'name': 'Bird Tail', 'supercategory': 'Bird'},
25 | {'id': 18, 'name': 'Snake Head', 'supercategory': 'Snake'},
26 | {'id': 19, 'name': 'Snake Body', 'supercategory': 'Snake'},
27 | {'id': 20, 'name': 'Reptile Head', 'supercategory': 'Reptile'},
28 | {'id': 21, 'name': 'Reptile Body', 'supercategory': 'Reptile'},
29 | {'id': 22, 'name': 'Reptile Foot', 'supercategory': 'Reptile'},
30 | {'id': 23, 'name': 'Reptile Tail', 'supercategory': 'Reptile'},
31 | {'id': 24, 'name': 'Car Body', 'supercategory': 'Car'},
32 | {'id': 25, 'name': 'Car Tier', 'supercategory': 'Car'},
33 | {'id': 26, 'name': 'Car Side Mirror', 'supercategory': 'Car'},
34 | {'id': 27, 'name': 'Bicycle Body', 'supercategory': 'Bicycle'},
35 | {'id': 28, 'name': 'Bicycle Head', 'supercategory': 'Bicycle'},
36 | {'id': 29, 'name': 'Bicycle Seat', 'supercategory': 'Bicycle'},
37 | {'id': 30, 'name': 'Bicycle Tier', 'supercategory': 'Bicycle'},
38 | {'id': 31, 'name': 'Boat Body', 'supercategory': 'Boat'},
39 | {'id': 32, 'name': 'Boat Sail', 'supercategory': 'Boat'},
40 | {'id': 33, 'name': 'Aeroplane Head', 'supercategory': 'Aeroplane'},
41 | {'id': 34, 'name': 'Aeroplane Body', 'supercategory': 'Aeroplane'},
42 | {'id': 35, 'name': 'Aeroplane Engine', 'supercategory': 'Aeroplane'},
43 | {'id': 36, 'name': 'Aeroplane Wing', 'supercategory': 'Aeroplane'},
44 | {'id': 37, 'name': 'Aeroplane Tail', 'supercategory': 'Aeroplane'},
45 | {'id': 38, 'name': 'Bottle Mouth', 'supercategory': 'Bottle'},
46 | {'id': 39, 'name': 'Bottle Body', 'supercategory': 'Bottle'}]
47 |
48 |
49 | def _get_partimagenet_metadata(key):
50 | id_to_name = {x['id']: x['name'] for x in PART_IN_CATEGORIES}
51 | thing_dataset_id_to_contiguous_id = {
52 | x: i for i, x in enumerate(sorted(id_to_name))}
53 | thing_classes = [id_to_name[k] for k in sorted(id_to_name)]
54 |
55 | part_classes = [a.split(" ")[1].lower() for a in thing_classes]
56 | thing_clases_id_to_part_id = {v: sorted(set(part_classes)).index(n) for v, n in enumerate(part_classes)}
57 | whole_classes = [a.split(" ")[0].lower() for a in thing_classes]
58 | thing_clases_id_to_whole_id = {v: sorted(set(whole_classes)).index(n) for v, n in enumerate(whole_classes)}
59 | thing_clases_id_to_flattened_wholepart = {tid: thing_clases_id_to_whole_id[tid] * len(set(part_classes)) + pid for
60 | tid, pid in thing_clases_id_to_part_id.items()}
61 | return {
62 | "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id,
63 | "thing_classes": thing_classes,
64 | "thing_clases_id_to_part_id": thing_clases_id_to_part_id,
65 | "part_classes": sorted(set(part_classes)),
66 | "thing_clases_id_to_whole_id": thing_clases_id_to_whole_id,
67 | "whole_classes": sorted(set(whole_classes)),
68 | "thing_clases_id_to_flattened_wholepart": thing_clases_id_to_flattened_wholepart,
69 | }
70 |
71 |
72 | def register_partimagenet_part_instances(name, metadata, json_file, image_root):
73 | DatasetCatalog.register(name, lambda: load_coco_json(
74 | json_file, image_root, name))
75 | MetadataCatalog.get(name).set(
76 | json_file=json_file, image_root=image_root,
77 | evaluator_type="pascal_part_interactive", **metadata
78 | )
79 |
80 |
81 | _PART_IN = {
82 | "partimagenet_train": ("imagenet/train", "partimagenet/train_format.json"),
83 | "partimagenet_val_inter": ("imagenet/val", "partimagenet/val_format_mini.json"),
84 | }
85 |
86 |
87 | def register_partimagenet_part(root):
88 | for key, (image_root, json_file) in _PART_IN.items():
89 | register_partimagenet_part_instances(
90 | key,
91 | _get_partimagenet_metadata(key),
92 | os.path.join(root, json_file) if "://" not in json_file else json_file,
93 | os.path.join(root, image_root),
94 | )
95 |
96 |
97 | _root = os.getenv("PART_IN", "datasets")
98 | if _root!='datasets':
99 | register_partimagenet_part(_root)
100 |
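
The "flattened wholepart" id above packs a whole-object id and a part id into one index: flattened = whole_id * len(part_classes) + part_id. With the 40 categories listed above there are 11 whole classes and 13 part classes, so e.g. 'Quadruped Head' (whole 'quadruped' = 8, part 'head' = 5 after sorting) flattens to 8 * 13 + 5 = 109. A quick self-check derived only from PART_IN_CATEGORIES:

meta = _get_partimagenet_metadata("partimagenet_val_inter")
n_parts = len(meta["part_classes"])  # 13
for tid, flat in meta["thing_clases_id_to_flattened_wholepart"].items():
    assert flat == meta["thing_clases_id_to_whole_id"][tid] * n_parts + meta["thing_clases_id_to_part_id"][tid]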
--------------------------------------------------------------------------------
/datasets/registration/register_sam.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2022 The IDEA Authors. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ------------------------------------------------------------------------------------------------
16 | # Copyright (c) Facebook, Inc. and its affiliates.
17 | # ------------------------------------------------------------------------------------------------
18 | # Modified from:
19 | # https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/datasets/register_ade20k_instance.py
20 | # ------------------------------------------------------------------------------------------------
21 |
22 | import json
23 | import logging
24 | import numpy as np
25 | import os
26 | from PIL import Image
27 |
28 | from detectron2.data import DatasetCatalog, MetadataCatalog
29 | from detectron2.utils.file_io import PathManager
30 | import detectron2.utils.comm as comm
31 | import torch.distributed as dist
32 |
33 | import os.path as op
34 |
35 | SAM_CATEGORIES = [{'id': 1, 'name': 'stuff'}]
36 |
37 | _PREDEFINED_SPLITS = {
38 | # point annotations without masks
39 | "sam_train": (
40 | "",
41 | ),
42 | "sam_val": (
43 | "",
44 | ),
45 | }
46 |
47 |
48 | def _get_sam_instances_meta():
49 | thing_ids = [k["id"] for k in SAM_CATEGORIES]
50 | assert len(thing_ids) == 1, len(thing_ids)
52 | # Mapping from the single SAM category id to a contiguous id starting at 0
52 | thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)}
53 | thing_classes = [k["name"] for k in SAM_CATEGORIES]
54 | ret = {
55 | "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id,
56 | "thing_classes": thing_classes,
57 | }
58 | return ret
59 |
60 | def load_sam_index(tsv_file, dataset_name=None, extra_annotation_keys=None):
61 | """
62 | Build a dataset index over the SA-1B TSV shards found under `tsv_file`.
63 | Each returned dict points at one TSV shard; the images and annotations
64 | themselves are read lazily from the TSV files when the data is loaded.
65 | """
66 | dataset_dicts = []
67 | tsv_id = 0
68 | files = os.listdir(tsv_file)
69 | start = int(os.getenv("SAM_SUBSET_START", "90"))
70 | end = int(os.getenv("SAM_SUBSET_END", "100"))
71 | if len(files)>0 and 'part' in files[0]: # for hgx
72 | files = [f for f in files if '.tsv' in f and int(f.split('.')[1].split('_')[-1])>=start and int(f.split('.')[1].split('_')[-1])<end]
73 | else:
74 | files = [f for f in files if '.tsv' in f and int(f.split('.')[0].split('-')[-1])>=start and int(f.split('.')[0].split('-')[-1])<end]
--------------------------------------------------------------------------------
/datasets/registration/register_seginw_instance.py:
--------------------------------------------------------------------------------
46 | list[dict]: a list of dicts in Detectron2 standard format. (See
47 | `Using Custom Datasets </tutorials/datasets.html>`_ )
48 | """
49 |
50 | with PathManager.open(annot_json) as f:
51 | json_info = json.load(f)
52 |
53 | # build dictionary for grounding
54 | grd_dict = collections.defaultdict(list)
55 | for grd_ann in json_info['annotations']:
56 | image_id = int(grd_ann["image_id"])
57 | grd_dict[image_id].append(grd_ann)
58 |
59 | ret = []
60 | for image in json_info["images"]:
61 | image_id = int(image["id"])
62 | image_file = os.path.join(image_root, image['file_name'])
63 | grounding_anno = grd_dict[image_id]
64 |
65 | if 'train' in name and len(grounding_anno) == 0:
66 | continue
67 |
68 | ret.append(
69 | {
70 | "file_name": image_file,
71 | "image_id": image_id,
72 | "inst_info": grounding_anno,
73 | }
74 | )
75 |
76 | assert len(ret), f"No images found in {image_root}!"
77 | assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"]
78 | return ret
79 |
80 |
81 | def register_seginw(
82 | name, metadata, image_root, annot_json):
83 | DatasetCatalog.register(
84 | name,
85 | lambda: load_seginw_json(name, image_root, annot_json, metadata),
86 | )
87 | MetadataCatalog.get(name).set(
88 | image_root=image_root,
89 | json_file=annot_json,
90 | evaluator_type="seginw",
91 | ignore_label=255,
92 | label_divisor=1000,
93 | **metadata,
94 | )
95 |
96 |
97 | def register_all_seginw(root):
98 | for (
99 | prefix,
100 | (split, folder_name, annot_name),
101 | ) in _PREDEFINED_SPLITS_SEGINW.items():
102 | register_seginw(
103 | prefix,
104 | get_metadata(),
105 | os.path.join(root, folder_name, split),
106 | os.path.join(root, folder_name, split, annot_name),
107 | )
108 |
109 |
110 | _root = os.getenv("DATSETW", "datasets")
111 | if _root!='datasets':
112 | register_all_seginw(_root)
113 |
--------------------------------------------------------------------------------
/datasets/registration/register_sunrgbd_semseg.py:
--------------------------------------------------------------------------------
1 |
2 | # Copyright (c) Facebook, Inc. and its affiliates.
3 | # --------------------------------------------------------
4 | # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
5 | # Copyright (c) 2022 Microsoft
6 | # Licensed under The MIT License [see LICENSE for details]
7 | # Modified by Xueyan Zou (xueyan@cs.wisc.edu)
8 | # --------------------------------------------------------
9 | import numpy as np
10 | import os
11 | import glob
12 | from typing import List, Tuple, Union
13 |
14 | from detectron2.data import DatasetCatalog, MetadataCatalog
15 | from detectron2.structures import BoxMode
16 | from detectron2.utils.file_io import PathManager
17 |
18 | from utils.constants import SUN_RGBD_37
19 |
20 | __all__ = ["load_sunrgbd_instances", "register_sunrgbd_context"]
21 |
22 | def load_sunrgbd_instances(name: str, dirname: str, split: str, class_names: Union[List[str], Tuple[str, ...]]):
23 | """
24 | Load SUN-RGBD detection annotations to Detectron2 format.
25 |
26 | Args:
27 | dirname: contains the "image" and "label37" subdirectories
28 | split (str): one of "train", "test", "val", "trainval"
29 | class_names: list or tuple of class names
30 | """
31 | if split == 'val':
32 | split = 'test'
33 |
34 | # Needs to read many small annotation files. Makes sense at local
35 | image_pths = sorted(glob.glob(os.path.join(dirname, 'image', split, '*.jpg')))
36 | semseg_pths = sorted(glob.glob(os.path.join(dirname, 'label37', split, '*.png')))
37 |
38 | assert len(image_pths) == len(semseg_pths)
39 | # 5k images
40 | dicts = []
41 | for image_dir, semseg_dir in zip(image_pths, semseg_pths):
42 | r = {
43 | "file_name": image_dir,
44 | "sem_seg_file_name": semseg_dir,
45 | "image_id": semseg_dir.split('/')[-1].split('.')[0],
46 | }
47 | dicts.append(r)
48 | return dicts
49 |
50 |
51 | def register_sun_context(name, dirname, split, class_names=SUN_RGBD_37):
52 | DatasetCatalog.register(name, lambda: load_sunrgbd_instances(name, dirname, split, class_names))
53 | MetadataCatalog.get(name).set(
54 | stuff_classes=class_names,
55 | dirname=dirname,
56 | split=split,
57 | ignore_label=[0],
58 | thing_dataset_id_to_contiguous_id={},
59 | class_offset=1,
60 | keep_sem_bgd=False
61 | )
62 |
63 |
64 | def register_all_sunrgbd_seg(root):
65 | SPLITS = [
66 | ("sunrgbd_37_val_seg", "sun_rgbd", "val"),
67 | ]
68 |
69 | for name, dirname, split in SPLITS:
70 | register_sun_context(name, os.path.join(root, dirname), split)
71 | MetadataCatalog.get(name).evaluator_type = "sem_seg"
72 |
73 |
74 | _root = os.getenv("DATSETW", "datasets")
75 | if _root!='datasets':
76 | register_all_sunrgbd_seg(_root)
--------------------------------------------------------------------------------
/datasets/registration/register_ytvos_dataset.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (c) Facebook, Inc. and its affiliates.
3 | import os
4 | import glob
5 | import json
6 | from typing import List, Tuple, Union
7 |
8 | import cv2
9 | import numpy as np
10 | from scipy.io import loadmat
11 |
12 | from detectron2.data import DatasetCatalog, MetadataCatalog
13 | from detectron2.structures import BoxMode
14 | from detectron2.utils.file_io import PathManager
15 |
16 |
17 | __all__ = ["load_ytovs_instances", "register_ytvos_context"]
18 |
19 | def load_ytvos_instances(name: str, dirname: str, split: str):
20 | """
21 | Load YouTube-VOS annotations to Detectron2 format.
22 |
23 | Args:
24 | name (str): dataset name to register
25 | dirname: contains per-split "JPEGImages", "Annotations" and "meta.json"
26 | split (str): e.g., "train" or "valid"
27 | """
28 | meta_json = os.path.join(dirname, split, "meta.json")
29 | video_dir = os.path.join(dirname, split, 'JPEGImages')
30 | mask_dir = os.path.join(dirname, split, 'Annotations')
31 | video_names = os.listdir(video_dir)
32 | meta = json.load(open(meta_json))['videos']
33 |
34 | dicts = []
35 | for vid_name in video_names:
36 | objects = meta[vid_name]['objects']
37 | r = {
38 | "file_name": os.path.join(video_dir, vid_name),
39 | "mask_name": os.path.join(mask_dir, vid_name),
40 | "objects": objects,
41 | }
42 | dicts.append(r)
43 |
44 | return dicts
45 |
46 | def register_ytvos_context(name, dirname, split):
47 | if not os.path.exists(dirname):
48 | print("not register for ", name)
49 | return -1
50 | DatasetCatalog.register("{}".format(name), lambda: load_ytvos_instances(name, dirname, split))
51 | MetadataCatalog.get("{}".format(name)).set(
52 | dirname=dirname,
53 | thing_dataset_id_to_contiguous_id={},
54 | )
55 |
56 | def register_all_ytvos(root):
57 | SPLITS = [
58 | ("ytvos19_val", "ytvos2019", "valid"),
59 | ("ytvos18_val", "ytvos2018", "valid"),
60 | ]
61 |
62 | for name, dirname, split in SPLITS:
63 | register_ytvos_context(name, os.path.join(root, dirname), split)
64 | MetadataCatalog.get("{}".format(name)).evaluator_type = None
65 |
66 | _root = os.getenv("TRACKING_DATASET", "datasets")
67 | if _root!='datasets':
68 | register_all_ytvos(_root)
--------------------------------------------------------------------------------
/datasets/semseg_loader.py:
--------------------------------------------------------------------------------
1 | from PIL import Image
2 | import scipy.io
3 | import numpy as np
4 |
5 | def load_semseg(filename, loader_type):
6 | if loader_type == 'PIL':
7 | semseg = np.array(Image.open(filename), dtype=int)
8 | elif loader_type == 'MAT':
9 | semseg = scipy.io.loadmat(filename)['LabelMap']
10 | return semseg
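
The loader type is chosen per dataset through the `semseg_loader` metadata key set in register_context_semseg.py ("PIL" for the 59-class PNG labels, "MAT" for the 459-class .mat label maps). A minimal usage sketch with hypothetical file paths:

label_59 = load_semseg("pascal_context/59_context_labels/2008_000002.png", "PIL")
label_459 = load_semseg("pascal_context/trainval/2008_000002.mat", "MAT")
print(label_59.shape, label_459.shape)  # both are (H, W) integer label maps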
--------------------------------------------------------------------------------
/datasets/shapes/__init__.py:
--------------------------------------------------------------------------------
1 | from .sampler import ShapeSampler
2 | from .simpleclick_sampler import SimpleClickSampler
3 |
4 |
5 | def build_shape_sampler(cfg, **kwargs):
6 | sampler_name = cfg['STROKE_SAMPLER']['EVAL']['MODE']
7 | if sampler_name == 'random':
8 | return ShapeSampler(cfg, **kwargs)
9 | elif sampler_name == 'best':
10 | return SimpleClickSampler(cfg, **kwargs)
11 | else:
12 | assert False, "not implemented"
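
The sampler is selected by cfg['STROKE_SAMPLER']['EVAL']['MODE']: 'random' yields the training-style ShapeSampler, 'best' the SimpleClick-style sampler. A minimal illustrative config, assuming the configurable wrapper accepts a plain nested dict as cfg (the values below are assumptions for illustration, not the defaults shipped in configs/*.yaml):

cfg = {
    'STROKE_SAMPLER': {
        'MAX_CANDIDATE': 10,
        'CANDIDATE_NAMES': ['Scribble'],
        'CANDIDATE_PROBS': [1.0],
        'SCRIBBLE': {
            'NUM_STROKES': 5,
            'STROKE_PRESET': ['rand_curve', 'rand_curve_small'],
            'STROKE_PROB': [0.5, 0.5],
        },
        'EVAL': {'MODE': 'random', 'NEGATIVE': False, 'MAX_ITER': 20},
        'DILATION': 3,
    }
}
sampler = build_shape_sampler(cfg, is_train=True)  # returns a ShapeSampler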
--------------------------------------------------------------------------------
/datasets/shapes/sampler.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import random
3 |
4 | import torch
5 | import torch.nn as nn
6 |
7 | from dinov.utils import configurable
8 | from .scribble import Scribble
9 |
10 |
11 | class ShapeSampler(nn.Module):
12 | @configurable
13 | def __init__(self, max_candidate=1, shape_prob=[], shape_candidate=[], is_train=True):
14 | super().__init__()
15 | self.max_candidate = max_candidate
16 | self.shape_prob = shape_prob
17 | self.shape_candidate = shape_candidate
18 | self.is_train = is_train
19 |
20 | @classmethod
21 | def from_config(cls, cfg, is_train=True, mode=None):
22 | max_candidate = cfg['STROKE_SAMPLER']['MAX_CANDIDATE']
23 | candidate_probs = cfg['STROKE_SAMPLER']['CANDIDATE_PROBS']
24 | candidate_names = cfg['STROKE_SAMPLER']['CANDIDATE_NAMES']
25 | candidate_classes = [getattr(sys.modules[__name__], class_name)(cfg, is_train) for class_name in candidate_names]
26 |
27 | # overwrite candidate_probs
28 | if not is_train:
29 | candidate_probs = [0.0 for x in range(len(candidate_names))]
30 | candidate_probs[candidate_names.index(mode)] = 1.0
31 |
32 | # Build augmentation
33 | return {
34 | "max_candidate": max_candidate,
35 | "shape_prob": candidate_probs,
36 | "shape_candidate": candidate_classes,
37 | "is_train": is_train,
38 | }
39 |
40 | def forward(self, masks, boxes, max_candidate=50):
41 | # masks = instances.gt_masks.tensor
42 | # boxes = instances.gt_boxes.tensor
43 |
44 | if len(masks) == 0:
45 | gt_masks = torch.zeros(masks.shape[-2:]).bool()
46 | rand_masks = torch.zeros(masks.shape[-2:]).bool()
47 | return {'gt_masks': gt_masks[None,:], 'rand_shape': torch.stack([rand_masks]), 'types': ['none']}
48 | indices = [x for x in range(len(masks))]
49 |
50 | if self.is_train:
51 | # random.shuffle(indices)
52 | candidate_mask = masks[indices[:max_candidate]]
53 | candidate_box = boxes[indices[:max_candidate]]  # needed by the zip() below in both branches
54 | else:
55 | candidate_mask = masks
56 | candidate_box = boxes
57 |
58 | draw_funcs = random.choices(self.shape_candidate, weights=self.shape_prob, k=len(candidate_mask)) # sample one shape, i.e., point
59 | rand_shapes = [d.draw(x, b).cuda() for d,x, b in zip(draw_funcs, candidate_mask, candidate_box)]
60 | types = [repr(x) for x in draw_funcs]
61 | for i in range(0, len(rand_shapes)):
62 | if rand_shapes[i].sum() == 0:
63 | candidate_mask[i] = candidate_mask[i] * 0
64 | types[i] = 'none'
65 |
66 | # candidate_mask: (c,h,w), bool. rand_shape: (c, iter, h, w), bool. types: list(c)
67 | try:
68 | rand_shapes_stacked = torch.stack(rand_shapes)  # sanity check: all sampled shapes must share one device/shape
69 | except RuntimeError:
70 | for r in rand_shapes:
71 | print('r ', r.device)
72 | print(candidate_mask.device)
73 | return {'gt_masks': candidate_mask, 'rand_shape': torch.stack(rand_shapes), 'types': types, 'sampler': self}
74 |
75 | def build_shape_sampler(cfg, **kwargs):
76 | return ShapeSampler(cfg, **kwargs)
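
A forward-pass sketch for ShapeSampler (assumptions: a CUDA device is available, since Scribble.draw keeps the sampled shapes on the GPU, and `cfg` is a nested dict shaped like the illustrative one shown under datasets/shapes/__init__.py above):

import torch

sampler = build_shape_sampler(cfg, is_train=True)
masks = torch.zeros(3, 64, 64, dtype=torch.bool, device='cuda')  # masks must live on GPU for the CUDA multiply in Scribble.draw
masks[0, 10:30, 10:30] = True
masks[1, 40:60, 5:25] = True
masks[2, 20:50, 40:60] = True
boxes = torch.tensor([[10., 10., 30., 30.], [5., 40., 25., 60.], [40., 20., 60., 50.]])
out = sampler(masks, boxes)
# out['gt_masks']: (c, h, w) bool, out['rand_shape']: (c, h, w) bool prompt masks, out['types']: list of length c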
--------------------------------------------------------------------------------
/datasets/shapes/scribble.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 | import torch
4 |
5 | from .mask_generators import get_mask_by_input_strokes
6 |
7 |
8 | class Scribble:
9 | def __init__(self, cfg, is_train):
10 | self.num_stroke = cfg['STROKE_SAMPLER']['SCRIBBLE']['NUM_STROKES']
11 | self.stroke_preset = cfg['STROKE_SAMPLER']['SCRIBBLE']['STROKE_PRESET']
12 | self.stroke_prob = cfg['STROKE_SAMPLER']['SCRIBBLE']['STROKE_PROB']
13 | self.eval_stroke = cfg['STROKE_SAMPLER']['EVAL']['MAX_ITER']
14 | self.is_train = is_train
15 |
16 | @staticmethod
17 | def get_stroke_preset(stroke_preset):
18 | if stroke_preset == 'rand_curve':
19 | return {
20 | "nVertexBound": [20, 50],
21 | "maxHeadSpeed": 30,
22 | "maxHeadAcceleration": (30, 0.5),
23 | "brushWidthBound": (3, 15),
24 | "nMovePointRatio": 0.5,
25 | "maxPiontMove": 6,
26 | "maxLineAcceleration": (9, 0.5),
27 | "boarderGap": None,
28 | "maxInitSpeed": 10
29 | }
30 | elif stroke_preset == 'rand_curve_small':
31 | return {
32 | "nVertexBound": [6, 22],
33 | "maxHeadSpeed": 12,
34 | "maxHeadAcceleration": (8, 0.5),
35 | "brushWidthBound": (2.5, 5),
36 | "nMovePointRatio": 0.5,
37 | "maxPiontMove": 1.5,
38 | "maxLineAcceleration": (3, 0.5),
39 | "boarderGap": None,
40 | "maxInitSpeed": 3
41 | }
42 | else:
43 | raise NotImplementedError(f'The stroke presetting "{stroke_preset}" does not exist.')
44 |
45 | def get_random_points_from_mask(self, mask, n=5):
46 | h,w = mask.shape
47 | view_mask = mask.reshape(h*w)
48 | non_zero_idx = view_mask.nonzero()[:,0]
49 | selected_idx = torch.randperm(len(non_zero_idx))[:n]
50 | non_zero_idx = non_zero_idx[selected_idx]
51 | y = (non_zero_idx // w)*1.0
52 | x = (non_zero_idx % w)*1.0
53 | return torch.cat((x[:,None], y[:,None]), dim=1).cpu().numpy()
54 |
55 | def draw(self, mask=None, box=None):
56 | if mask.sum() < 1:
57 | return torch.zeros(mask.shape).bool().cuda() # if mask is empty
58 | if not self.is_train:
59 | return self.draw_eval(mask=mask, box=box)
60 | stroke_preset_name = random.choices(self.stroke_preset, weights=self.stroke_prob, k=1)[0]
61 | preset = Scribble.get_stroke_preset(stroke_preset_name)
62 | nStroke = random.randint(1, min(self.num_stroke, mask.sum().item()))
63 | h,w = mask.shape
64 | points = self.get_random_points_from_mask(mask, n=nStroke)
65 | rand_mask = get_mask_by_input_strokes(
66 | init_points=points,
67 | imageWidth=w, imageHeight=h, nStroke=min(nStroke, len(points)), **preset)
68 | rand_mask = (~torch.from_numpy(rand_mask)).cuda() * mask
69 | return rand_mask
70 |
71 | def draw_eval(self, mask=None, box=None):
72 | stroke_preset_name = random.choices(self.stroke_preset, weights=self.stroke_prob, k=1)[0]
73 | preset = Scribble.get_stroke_preset(stroke_preset_name)
74 | nStroke = min(self.eval_stroke, mask.sum().item())
75 | h,w = mask.shape
76 | points = self.get_random_points_from_mask(mask, n=nStroke)
77 | rand_masks = []
78 | for i in range(len(points)):
79 | rand_mask = get_mask_by_input_strokes(
80 | init_points=points[:i+1],
81 | imageWidth=w, imageHeight=h, nStroke=min(i, len(points)), **preset)
82 | rand_mask = (~torch.from_numpy(rand_mask)).cuda() * mask
83 | rand_masks += [rand_mask]
84 | return torch.stack(rand_masks)
85 |
86 | @staticmethod
87 | def draw_by_points(points, mask, h, w):
88 | preset = Scribble.get_stroke_preset('rand_curve_small')
89 | rand_mask = get_mask_by_input_strokes(
90 | init_points=points,
91 | imageWidth=w, imageHeight=h, nStroke=len(points), **preset)[None,]
92 | rand_masks = (~torch.from_numpy(rand_mask)) * mask
93 | return rand_masks
94 |
95 | def __repr__(self,):
96 | return 'scribble'
--------------------------------------------------------------------------------
/datasets/shapes/simpleclick_sampler.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import random
3 |
4 | import cv2
5 | import numpy as np
6 | from scipy import ndimage
7 | import torch
8 | import torch.nn as nn
9 | import torch.nn.functional as F
10 | from kornia.contrib import distance_transform
11 |
12 | from .scribble import Scribble
13 | from dinov.utils import configurable
14 |
15 |
16 | class SimpleClickSampler(nn.Module):
17 | @configurable
18 | def __init__(self, mask_mode='point', sample_negtive=False, is_train=True, dilation=None, dilation_kernel=None):
19 | super().__init__()
20 | self.mask_mode = mask_mode
21 | self.sample_negtive = sample_negtive
22 | self.is_train = is_train
23 | self.dilation = dilation
24 | self.register_buffer("dilation_kernel", dilation_kernel)
25 |
26 | @classmethod
27 | def from_config(cls, cfg, is_train=True, mode=None):
28 | mask_mode = mode
29 | sample_negtive = cfg['STROKE_SAMPLER']['EVAL']['NEGATIVE']
30 |
31 | dilation = cfg['STROKE_SAMPLER']['DILATION']
32 | dilation_kernel = torch.ones((1, 1, dilation, dilation), device=torch.cuda.current_device())
33 |
34 | # Build augmentation
35 | return {
36 | "mask_mode": mask_mode,
37 | "sample_negtive": sample_negtive,
38 | "is_train": is_train,
39 | "dilation": dilation,
40 | "dilation_kernel": dilation_kernel,
41 | }
42 |
43 | def forward_scribble(self, instances, pred_masks=None, prev_masks=None):
44 | gt_masks_batch = instances.gt_masks
45 | _,h,w = gt_masks_batch.shape
46 |
47 | rand_shapes = []
48 | for i in range(len(gt_masks_batch)):
49 | gt_masks = gt_masks_batch[i:i+1]
50 | assert len(gt_masks) == 1 # it only supports a single image, with a single candidate mask.
51 | # pred_masks is after padding
52 |
53 | # We only consider positive points
54 | pred_masks = torch.zeros(gt_masks.shape).bool() if pred_masks is None else pred_masks[:,:h,:w]
55 | prev_masks = torch.zeros(gt_masks.shape).bool() if prev_masks is None else prev_masks
56 |
57 | fp = gt_masks & (~(gt_masks & pred_masks)) & (~prev_masks)
58 | next_mask = torch.zeros(gt_masks.shape).bool()
59 |
60 | mask_dt = torch.from_numpy(cv2.distanceTransform(fp[0].numpy().astype(np.uint8), cv2.DIST_L2, 0)[None,:])
61 | max_value = mask_dt.max()
62 | next_mask[(mask_dt==max_value).nonzero()[0:1].t().tolist()] = True
63 |
64 | points = next_mask[0].nonzero().flip(dims=[-1])
65 | next_mask = Scribble.draw_by_points(points, gt_masks, h, w)
66 | rand_shapes += [(prev_masks | next_mask)]
67 |
68 | types = ['scribble' for i in range(len(gt_masks_batch))]
69 | return {'gt_masks': instances.gt_masks, 'rand_shape': rand_shapes, 'types': types, 'sampler': self}
70 |
71 | def forward(self, instances, *args, **kwargs):
72 | if self.mask_mode == 'Point':
73 | return self.forward_point(instances, *args, **kwargs)
74 | elif self.mask_mode == 'Circle':
75 | assert False, "Circle not support best path."
76 | elif self.mask_mode == 'Scribble':
77 | assert False, "Scribble not support best path."
78 | elif self.mask_mode == 'Polygon':
79 | assert False, "Polygon not support best path."
80 |
--------------------------------------------------------------------------------
/datasets/utils/tsv/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: Yihao Chen
3 | # @Date: 2021-08-16 16:56:22
4 | # @Last Modified by: Yihao Chen
5 | # @Last Modified time: 2021-08-16 17:00:28
6 |
7 | from .io_common import FileProgressingbar, img_from_base64, generate_lineidx
8 | from .tsv_io import TSVFile
9 |
10 | __all__ = [
11 | 'FileProgressingbar', 'img_from_base64', 'generate_lineidx', 'TSVFile'
12 | ]
--------------------------------------------------------------------------------
/datasets/utils/tsv/io_common.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: Yihao Chen
3 | # @Date: 2021-08-13 14:35:27
4 | # @Last Modified by: Yihao Chen
5 | # @Last Modified time: 2022-04-24 11:38:58
6 |
7 | import os
8 | import base64
9 | from io import BytesIO
10 | from PIL import Image
11 |
12 | import cv2
13 | import yaml
14 | import progressbar
15 | import numpy as np
16 | import torchvision.transforms as T
17 |
18 | class FileProgressingbar:
19 | fileobj = None
20 | pbar = None
21 | def __init__(self, fileobj, msg):
22 | fileobj.seek(0, os.SEEK_END)
23 | flen = fileobj.tell()
24 | fileobj.seek(0, os.SEEK_SET)
25 | self.fileobj = fileobj
26 | widgets = [msg, progressbar.AnimatedMarker(), ' ', progressbar.Percentage(), ' ', progressbar.Bar(), ' ', progressbar.ETA()]
27 | self.pbar = progressbar.ProgressBar(widgets=widgets, maxval=flen).start()
28 |
29 | def update(self):
30 | self.pbar.update(self.fileobj.tell())
31 |
32 |
33 | def img_from_base64(imagestring):
34 | jpgbytestring = base64.b64decode(imagestring)
35 | image = BytesIO(jpgbytestring)
36 | image = Image.open(image).convert("RGB")
37 | return image
38 |
39 | # jpgbytestring = base64.b64decode(imagestring)
40 | # nparr = np.frombuffer(jpgbytestring, np.uint8)
41 | # try:
42 | # r = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
43 | # # r = cv2.cvtColor(r, cv2.COLOR_BGR2RGB)
44 | # return r
45 | # except:
46 | # return None
47 |
48 |
49 | def generate_lineidx(filein, idxout):
50 | assert not os.path.isfile(idxout)
51 | with open(filein, 'r') as tsvin, open(idxout, 'w') as tsvout:
52 | bar = FileProgressingbar(tsvin, 'Generating lineidx {0}: '.format(idxout))
53 | fsize = os.fstat(tsvin.fileno()).st_size
54 | fpos = 0
55 | while fpos != fsize:
56 | tsvout.write(str(fpos)+"\n")
57 | tsvin.readline()
58 | fpos = tsvin.tell()
59 | bar.update()
60 |
--------------------------------------------------------------------------------
/datasets/utils/tsv/tsv_io.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: Yihao Chen
3 | # @Date: 2021-08-13 14:26:21
4 | # @Last Modified by: Yihao Chen
5 | # @Last Modified time: 2022-08-17 00:57:51
6 | import time
7 | import os
8 | import os.path as op
9 | from .io_common import generate_lineidx, FileProgressingbar
10 |
11 |
12 | class TSVFile(object):
13 | def __init__(self, tsv_file, silence=True):
14 | self.tsv_file = tsv_file
15 | self.lineidx = op.splitext(tsv_file)[0] + '.lineidx'
16 |
17 | self.label_file = op.splitext(tsv_file)[0] + '.label'
18 | self.label_lineidx = op.splitext(tsv_file)[0] + '.label.lineidx'
19 |
20 | if os.path.exists(self.label_file):
21 | self.split_label = True
22 | else:
23 | self.split_label = False
24 |
25 | self._fp = None
26 | self._lineidx = None
27 |
28 | self._label_fp = None
29 | self._label_lineidx = None
30 |
31 | self.pid = None
32 | self.silence = silence
33 | self._ensure_lineidx_loaded()
34 |
35 | def num_rows(self):
36 | return len(self._lineidx)
37 |
38 | def seek(self, idx):
39 | self._ensure_tsv_opened()
40 | pos = self._lineidx[idx]
41 | self._fp.seek(pos)
42 | tsv_info = [s.strip() for s in self._fp.readline().split('\t')]
43 |
44 | if self.split_label:
45 | label_pos = self._label_lineidx[idx]
46 | self._label_fp.seek(label_pos)
47 | label_info = [s.strip() for s in self._label_fp.readline().split('\t')]
48 |
49 | assert tsv_info[0] == label_info[0]
50 | tsv_info = [tsv_info[0], label_info[-1], tsv_info[-1]]
51 |
52 | return tsv_info
53 |
54 | def close(self):
55 | if self._fp is not None:
56 | self._fp.close()
57 | del self._fp
58 | del self._lineidx
59 |
60 | self._fp = None
61 | self._lineidx = None
62 |
63 | def _ensure_lineidx_loaded(self):
64 | if not op.isfile(self.lineidx) and not op.islink(self.lineidx):
65 | generate_lineidx(self.tsv_file, self.lineidx)
66 |
67 | if self._lineidx is None:
68 | with open(self.lineidx, 'r') as fp:
69 | lines = fp.readlines()
70 | self._lineidx = [int(i.strip().split()[0]) for i in lines]
71 |
72 | if self.split_label:
73 | with open(self.label_lineidx, 'r') as fp:
74 | lines = fp.readlines()
75 | self._label_lineidx = [int(i.strip().split()[0]) for i in lines]
76 |
77 |
78 | def _ensure_tsv_opened(self):
79 | self._ensure_lineidx_loaded()
80 | if self._fp is None:
81 | self._fp = open(self.tsv_file, 'r')
82 | self.pid = os.getpid()
83 |
84 | if self.split_label:
85 | self._label_fp = open(self.label_file, 'r')
86 |
87 | if self.pid != os.getpid():
88 | print('re-open {} because the process id changed'.format(self.tsv_file))
89 | self._fp = open(self.tsv_file, 'r')
90 | self.pid = os.getpid()
91 |
92 | if self.split_label:
93 | self._label_fp = open(self.label_file, 'r')
94 |
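
Each row of an SA-1B style shard is read back as a list of tab-separated fields; with the usual three-column layout (key, annotation, base64-encoded image) a minimal read sketch looks like this (the shard path is hypothetical, and the .lineidx offset file is generated on first use):

from datasets.utils.tsv import TSVFile, img_from_base64

tsv = TSVFile("sa_000000.tsv")    # builds sa_000000.lineidx next to the shard if missing
print(tsv.num_rows())
key, anno, img_b64 = tsv.seek(0)  # one row: key, annotation, base64 image
image = img_from_base64(img_b64)  # PIL.Image in RGB
print(key, image.size)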
--------------------------------------------------------------------------------
/demo/__init__.py:
--------------------------------------------------------------------------------
1 | from .openset_task import task_openset
--------------------------------------------------------------------------------
/demo/examples/bags.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UX-Decoder/DINOv/53bf20d5cfdbb86fa35141a1cff432d4923599f2/demo/examples/bags.jpg
--------------------------------------------------------------------------------
/demo/examples/corgi2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UX-Decoder/DINOv/53bf20d5cfdbb86fa35141a1cff432d4923599f2/demo/examples/corgi2.jpg
--------------------------------------------------------------------------------
/demo/examples/img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UX-Decoder/DINOv/53bf20d5cfdbb86fa35141a1cff432d4923599f2/demo/examples/img.png
--------------------------------------------------------------------------------
/demo/examples/ref_cat.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UX-Decoder/DINOv/53bf20d5cfdbb86fa35141a1cff432d4923599f2/demo/examples/ref_cat.jpeg
--------------------------------------------------------------------------------
/demo/openset_task.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Semantic-SAM: Segment and Recognize Anything at Any Granularity
3 | # Copyright (c) 2023 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Hao Zhang (hzhangcx@connect.ust.hk)
6 | # --------------------------------------------------------
7 | # Copyright (c) 2024 Microsoft
8 | # Licensed under The MIT License [see LICENSE for details]
9 | # Written by Feng Li (fliay@connect.ust.hk)
10 | # --------------------------------------------------------
11 |
12 | import torch
13 | import numpy as np
14 | from torchvision import transforms
15 | from utils.visualizer import Visualizer
16 | from typing import Tuple
17 | from PIL import Image
18 | from detectron2.data import MetadataCatalog
19 | import os
20 | import cv2
21 |
22 | metadata = MetadataCatalog.get('coco_2017_train_panoptic')
23 |
24 |
25 | def inverse_sigmoid(x, eps=1e-5):
26 | x = x.clamp(min=0, max=1)
27 | x1 = x.clamp(min=eps)
28 | x2 = (1 - x).clamp(min=eps)
29 | return torch.log(x1/x2)
30 |
31 | def task_openset(model,generic_vp1, generic_vp2, generic_vp3, generic_vp4,
32 | generic_vp5, generic_vp6, generic_vp7, generic_vp8, image_tgt=None, text_size=640,hole_scale=100,island_scale=100):
33 | in_context_examples = [generic_vp1, generic_vp2, generic_vp3, generic_vp4,
34 | generic_vp5, generic_vp6, generic_vp7, generic_vp8]
35 | in_context_examples = [x for x in in_context_examples if x is not None]
36 | t = []
37 |     t.append(transforms.Resize(int(text_size), interpolation=transforms.InterpolationMode.BICUBIC))  # Image.BICUBIC was removed in Pillow 10
38 | def prepare_image(image_ori):
39 | width = image_ori.size[0]
40 | height = image_ori.size[1]
41 | image_ori = np.asarray(image_ori)
42 | images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
43 | return images, height, width
44 | transform1 = transforms.Compose(t)
45 | image_ori_tgt = transform1(image_tgt)
46 | images_tgt, height_tgt, width_tgt = prepare_image(image_ori_tgt)
47 | data_tgt = {"image": images_tgt, "height": height_tgt, "width": width_tgt}
48 | batched_inputs = []
49 | batched_inputs_tgt = [data_tgt]
50 | multi_scale_features2, mask_features2, _, _ = model.model.get_encoder_feature(batched_inputs_tgt)
51 | input_query_label_content_all = []
52 | point_coords = torch.ones(1, 4).cuda().float()
53 | point_coords[:, :2] = 0.
54 | input_query_bbox_content_init = inverse_sigmoid(point_coords[None])
55 | for image in in_context_examples:
56 | image_ori = transform1(image['image'])
57 | mask_ori = transform1(image['mask'])
58 | images, height, width = prepare_image(image_ori)
59 |
60 | data = {"image": images, "height": height, "width": width}
61 | data['seg_image'] = data_tgt
62 |
63 | mask_ori = np.asarray(mask_ori)[:,:,0:1].copy()
64 | mask_ori = torch.from_numpy(mask_ori).permute(2,0,1)
65 |
66 | data['targets'] = [dict()]
67 | data['targets'][0]['rand_shape']=mask_ori
68 | data['targets'][0]['pb']=torch.tensor([1.]) # FIXME 0 or 1
69 |
70 | frame = data
71 | rand_shape = mask_ori
72 | frame['targets'][0]['rand_shape'] = rand_shape
73 |
74 | batched_inputs.append(frame)
75 |
76 | multi_scale_features, _, padded_h, padded_w = model.model.get_encoder_feature([frame])
77 | input_query_label_content, input_query_bbox_content, attn_mask_content = model.model. \
78 | get_visual_prompt_content_feature(multi_scale_features, frame['targets'][0]['rand_shape'], padded_h, padded_w)
79 | input_query_label_content_all.append(input_query_label_content)
80 |
81 | # prompt to tgt image
82 | input_query_label_content_current = torch.stack(input_query_label_content_all).mean(0)
83 | masks, ious, ori_masks, scores_per_image_openset = model.model.evaluate_demo_content_openset_multi_with_content_features(
84 | batched_inputs_tgt, mask_features2, multi_scale_features2, input_query_label_content_current,
85 | input_query_bbox_content_init, attn_mask_content, padded_h, padded_w)
86 | if len(ious.shape)>1:
87 | ious=ious[0]
88 | ids=torch.argsort(scores_per_image_openset,descending=True)
89 | areas=[]
90 | image_ori = image_ori_tgt
91 | new_pred_mask = []
92 | new_pred_class_score = []
93 | for i in ids:
94 | new_pred_class_score.append(scores_per_image_openset[i])
95 | new_pred_mask.append(masks[i])
96 | pred_masks_poses = new_pred_mask
97 | ious = new_pred_class_score
98 | visual = Visualizer(image_ori, metadata=metadata)
99 | for i,(pred_masks_pos,iou, _, _) in enumerate(zip(pred_masks_poses,ious, pred_masks_poses, pred_masks_poses)):
100 | iou=round(float(iou),2)
101 | texts=f'{iou}'
102 | mask=(pred_masks_pos>0.0).cpu().numpy()
103 | area=mask.sum()
104 | areas.append(area)
105 | # uncomment for additional postprocessing
106 | # mask,_=remove_small_regions(mask,int(hole_scale),mode="holes")
107 | # mask,_=remove_small_regions(mask,int(island_scale),mode="islands")
108 |         mask=mask.astype(np.float32)  # np.float was removed in NumPy 1.24
109 | color=[0.,0.,1.0]
110 | color=[0.502, 0.0, 0.502]
111 | demo = visual.draw_binary_mask(mask, text='', alpha=0.7, edge_color=color)
112 | res = demo.get_image()
113 |
114 | torch.cuda.empty_cache()
115 |
116 | return res
117 |
118 | def remove_small_regions(
119 | mask: np.ndarray, area_thresh: float, mode: str
120 | ) -> Tuple[np.ndarray, bool]:
121 | """
122 | Removes small disconnected regions and holes in a mask. Returns the
123 | mask and an indicator of if the mask has been modified.
124 | """
125 | import cv2 # type: ignore
126 |
127 | assert mode in ["holes", "islands"]
128 | correct_holes = mode == "holes"
129 | working_mask = (correct_holes ^ mask).astype(np.uint8)
130 | n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
131 | sizes = stats[:, -1][1:] # Row 0 is background label
132 | small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
133 | if len(small_regions) == 0:
134 | return mask, False
135 | fill_labels = [0] + small_regions
136 | if not correct_holes:
137 | fill_labels = [i for i in range(n_labels) if i not in fill_labels]
138 | # If every region is below threshold, keep largest
139 | if len(fill_labels) == 0:
140 | fill_labels = [int(np.argmax(sizes)) + 1]
141 | mask = np.isin(regions, fill_labels)
142 | return mask, True
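
A small usage sketch for remove_small_regions above (assumes the repo is on PYTHONPATH with its dependencies, e.g. torch, detectron2 and opencv, installed, since importing the module pulls them in):

import numpy as np
from demo.openset_task import remove_small_regions

mask = np.zeros((64, 64), dtype=bool)
mask[8:40, 8:40] = True        # one large foreground region (32*32 px)
mask[50:52, 50:52] = True      # a tiny island below the area threshold
cleaned, changed = remove_small_regions(mask, area_thresh=16, mode="islands")
print(changed, int(cleaned.sum()))   # True 1024 -> only the large region survives
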
--------------------------------------------------------------------------------
/demo_openset.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Semantic-SAM: Segment and Recognize Anything at Any Granularity
3 | # Copyright (c) 2023 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Hao Zhang (hzhangcx@connect.ust.hk)
6 | # --------------------------------------------------------
7 | # Copyright (c) 2024 Microsoft
8 | # Licensed under The MIT License [see LICENSE for details]
9 | # Written by Feng Li (fliay@connect.ust.hk)
10 | # --------------------------------------------------------
11 |
12 |
13 | import gradio as gr
14 | import torch
15 | import argparse
16 |
17 | from dinov.BaseModel import BaseModel
18 | from dinov import build_model
19 | from utils.arguments import load_opt_from_config_file
20 |
21 | from demo import task_openset
22 |
23 | def parse_option():
24 | parser = argparse.ArgumentParser('DINOv Demo', add_help=False)
25 | parser.add_argument('--conf_files', default="configs/dinov_sam_coco_swinl_train.yaml", metavar="FILE", help='path to config file', )
26 | parser.add_argument('--ckpt', default="", metavar="FILE", help='path to ckpt', required=True)
27 |     parser.add_argument('--port', default=6099, type=int, help='port for the demo server', )
28 | args = parser.parse_args()
29 |
30 | return args
31 |
32 |
33 | class ImageMask(gr.components.Image):
34 | """
35 |     Sets: source="upload", tool="sketch"
36 | """
37 |
38 | is_template = True
39 |
40 | def __init__(self, **kwargs):
41 | super().__init__(source="upload", tool="sketch", interactive=True, **kwargs)
42 |
43 | def preprocess(self, x):
44 | return super().preprocess(x)
45 |
46 |
47 | '''
48 | build args
49 | '''
50 | args = parse_option()
51 |
52 | '''
53 | build model
54 | '''
55 |
56 | sam_cfg=args.conf_files
57 |
58 | opt = load_opt_from_config_file(sam_cfg)
59 |
60 | model_sam = BaseModel(opt, build_model(opt)).from_pretrained(args.ckpt).eval().cuda()
61 |
62 | @torch.no_grad()
63 | def inference(generic_vp1, generic_vp2, generic_vp3, generic_vp4,
64 | generic_vp5, generic_vp6, generic_vp7, generic_vp8, image2,*args, **kwargs):
65 | with torch.autocast(device_type='cuda', dtype=torch.float16):
66 | model=model_sam
67 | a= task_openset(model, generic_vp1, generic_vp2, generic_vp3, generic_vp4,
68 | generic_vp5, generic_vp6, generic_vp7, generic_vp8, image2, *args, **kwargs)
69 | return a
70 |
71 |
72 | '''
73 | launch app
74 | '''
75 | title = "DINOv: Visual In-Context Prompting"
76 |
77 | article = "The demo is run on DINOv."
78 |
79 | demo = gr.Blocks()
80 | image_tgt=gr.components.Image(label="Target Image ",type="pil",brush_radius=15.0)
81 | gallery_output=gr.components.Image(label="Results Image ",type="pil",brush_radius=15.0)
82 |
83 | generic_vp1 = ImageMask(label="scribble on refer Image 1",type="pil",brush_radius=15.0)
84 | generic_vp2 = ImageMask(label="scribble on refer Image 2",type="pil",brush_radius=15.0)
85 | generic_vp3 = ImageMask(label="scribble on refer Image 3",type="pil",brush_radius=15.0)
86 | generic_vp4 = ImageMask(label="scribble on refer Image 4",type="pil",brush_radius=15.0)
87 | generic_vp5 = ImageMask(label="scribble on refer Image 5",type="pil",brush_radius=15.0)
88 | generic_vp6 = ImageMask(label="scribble on refer Image 6",type="pil",brush_radius=15.0)
89 | generic_vp7 = ImageMask(label="scribble on refer Image 7",type="pil",brush_radius=15.0)
90 | generic_vp8 = ImageMask(label="scribble on refer Image 8",type="pil",brush_radius=15.0)
91 | generic = gr.TabbedInterface([
92 | generic_vp1, generic_vp2, generic_vp3, generic_vp4,
93 | generic_vp5, generic_vp6, generic_vp7, generic_vp8
94 | ], ["1", "2", "3", "4", "5", "6", "7", "8"])
95 |
96 | title='''
97 | # DINOv: Visual In-Context Prompting
98 |
99 | # \[[Read our arXiv Paper](https://arxiv.org/pdf/2311.13601.pdf)\] \[[Github page](https://github.com/UX-Decoder/DINOv)\]
100 | '''
101 |
102 | with demo:
103 | with gr.Row():
104 | with gr.Column(scale=3.0):
105 |             generation_title = gr.Markdown(title)
106 | image_tgt.render()
107 | generic.render()
108 | with gr.Row(scale=2.0):
109 | clearBtn = gr.ClearButton(
110 | components=[image_tgt])
111 | runBtn = gr.Button("Run")
112 | with gr.Column(scale=5.0):
113 |
114 |             gallery_title = gr.Markdown("# Open-set results.")
115 | with gr.Row(scale=9.0):
116 | gallery_output.render()
117 |
118 | example = gr.Examples(
119 | examples=[
120 | ["demo/examples/bags.jpg"],
121 | ["demo/examples/img.png"],
122 | ["demo/examples/corgi2.jpg"],
123 | ["demo/examples/ref_cat.jpeg"],
124 | ],
125 | inputs=image_tgt,
126 | cache_examples=False,
127 | )
128 |
129 | title = title,
130 | article = article,
131 | allow_flagging = 'never',
132 |
133 | runBtn.click(inference, inputs=[generic_vp1, generic_vp2, generic_vp3, generic_vp4,
134 | generic_vp5, generic_vp6, generic_vp7, generic_vp8, image_tgt],
135 | outputs = [gallery_output])
136 |
137 |
138 |
139 | demo.queue().launch(share=True,server_port=args.port)
140 |
141 |
--------------------------------------------------------------------------------
/dinov/BaseModel.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 |
4 | import torch
5 | import torch.nn as nn
6 |
7 | from utils.model import align_and_update_state_dicts
8 |
9 | logger = logging.getLogger(__name__)
10 |
11 |
12 | class BaseModel(nn.Module):
13 | def __init__(self, opt, module: nn.Module):
14 | super(BaseModel, self).__init__()
15 | self.opt = opt
16 | self.model = module
17 |
18 | def forward(self, *inputs, **kwargs):
19 | outputs = self.model(*inputs, **kwargs)
20 | return outputs
21 |
22 | def from_pretrained(self, load_dir):
23 | state_dict = torch.load(load_dir, map_location='cpu')
24 | if 'model' in state_dict:
25 | state_dict=state_dict['model']
26 | state_dict={k[6:]:v for k,v in state_dict.items() if k.startswith('model.')}
27 | state_dict = align_and_update_state_dicts(self.model.state_dict(), state_dict)
28 | self.model.load_state_dict(state_dict, strict=False)
29 | return self
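
A standalone sketch of the checkpoint-key handling in from_pretrained above: when the checkpoint was saved from a BaseModel wrapper, its keys carry a "model." prefix that is stripped before alignment (the toy state dict below is illustrative only):

ckpt = {'model.backbone.w': 1, 'model.head.b': 2, 'optimizer.lr': 0.1}
state_dict = {k[6:]: v for k, v in ckpt.items() if k.startswith('model.')}
print(state_dict)   # {'backbone.w': 1, 'head.b': 2}
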
--------------------------------------------------------------------------------
/dinov/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | from .architectures import build_model
6 | from utils.dist import get_world_size, all_gather
--------------------------------------------------------------------------------
/dinov/architectures/__init__.py:
--------------------------------------------------------------------------------
1 | from .dinov import *
2 | from .build import build_model
3 |
4 |
--------------------------------------------------------------------------------
/dinov/architectures/build.py:
--------------------------------------------------------------------------------
1 | from .registry import model_entrypoints
2 | from .registry import is_model
3 |
4 |
5 | def build_model(config, **kwargs):
6 | model_name = config['MODEL']['NAME']
7 |
8 | if not is_model(model_name):
9 |         raise ValueError(f'Unknown model: {model_name}')
10 |
11 | return model_entrypoints(model_name)(config, **kwargs)
--------------------------------------------------------------------------------
/dinov/architectures/registry.py:
--------------------------------------------------------------------------------
1 | _model_entrypoints = {}
2 |
3 | def register_model(fn):
4 | module_name_split = fn.__module__.split('.')
5 | model_name = module_name_split[-1]
6 | _model_entrypoints[model_name] = fn
7 | return fn
8 |
9 | def model_entrypoints(model_name):
10 | return _model_entrypoints[model_name]
11 |
12 | def is_model(model_name):
13 | return model_name in _model_entrypoints
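
The same registry pattern recurs for the backbones, encoders, decoders and heads below; here is a self-contained sketch of how it resolves names. Note that the key is the *module* name of the decorated entrypoint, which is why config NAME values are expected to match module file names (e.g. dinov, swin):

_model_entrypoints = {}

def register_model(fn):
    _model_entrypoints[fn.__module__.split('.')[-1]] = fn
    return fn

@register_model                                # registered under this module's name
def build_dummy(cfg, **kwargs):
    return ('built', cfg['MODEL']['NAME'])

key = build_dummy.__module__.split('.')[-1]    # '__main__' when run as a script
print(key in _model_entrypoints)               # True
print(_model_entrypoints[key]({'MODEL': {'NAME': key}}))
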
--------------------------------------------------------------------------------
/dinov/backbone/__init__.py:
--------------------------------------------------------------------------------
1 | from .build import build_backbone
2 |
3 | from .focal import *
4 | from .focal_dw import *
5 | from .swin import *
6 | from .backbone import *
--------------------------------------------------------------------------------
/dinov/backbone/backbone.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import torch.nn as nn
3 |
4 | from detectron2.modeling import ShapeSpec
5 |
6 | __all__ = ["Backbone"]
7 |
8 |
9 | class Backbone(nn.Module):
10 | """
11 | Abstract base class for network backbones.
12 | """
13 |
14 | def __init__(self):
15 | """
16 | The `__init__` method of any subclass can specify its own set of arguments.
17 | """
18 | super().__init__()
19 |
20 | def forward(self):
21 | """
22 | Subclasses must override this method, but adhere to the same return type.
23 |
24 | Returns:
25 | dict[str->Tensor]: mapping from feature name (e.g., "res2") to tensor
26 | """
27 | pass
28 |
29 | @property
30 | def size_divisibility(self) -> int:
31 | """
32 | Some backbones require the input height and width to be divisible by a
33 | specific integer. This is typically true for encoder / decoder type networks
34 | with lateral connection (e.g., FPN) for which feature maps need to match
35 | dimension in the "bottom up" and "top down" paths. Set to 0 if no specific
36 | input size divisibility is required.
37 | """
38 | return 0
39 |
40 | def output_shape(self):
41 | """
42 | Returns:
43 | dict[str->ShapeSpec]
44 | """
45 | # this is a backward-compatible default
46 | return {
47 | name: ShapeSpec(
48 | channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
49 | )
50 | for name in self._out_features
51 | }
52 |
--------------------------------------------------------------------------------
/dinov/backbone/build.py:
--------------------------------------------------------------------------------
1 | from .registry import model_entrypoints
2 | from .registry import is_model
3 |
4 | from .backbone import *
5 |
6 | def build_backbone(config, **kwargs):
7 | model_name = config['MODEL']['BACKBONE']['NAME']
8 | if not is_model(model_name):
9 |         raise ValueError(f'Unknown model: {model_name}')
10 |
11 | return model_entrypoints(model_name)(config, **kwargs)
--------------------------------------------------------------------------------
/dinov/backbone/registry.py:
--------------------------------------------------------------------------------
1 | _model_entrypoints = {}
2 |
3 |
4 | def register_backbone(fn):
5 | module_name_split = fn.__module__.split('.')
6 | model_name = module_name_split[-1]
7 | _model_entrypoints[model_name] = fn
8 | return fn
9 |
10 | def model_entrypoints(model_name):
11 | return _model_entrypoints[model_name]
12 |
13 | def is_model(model_name):
14 | return model_name in _model_entrypoints
15 |
--------------------------------------------------------------------------------
/dinov/body/__init__.py:
--------------------------------------------------------------------------------
1 | from .build import build_openseed_head
--------------------------------------------------------------------------------
/dinov/body/build.py:
--------------------------------------------------------------------------------
1 | from .registry import model_entrypoints
2 | from .registry import is_model
3 | from .general_head import *
4 |
5 |
6 | def build_openseed_head(config, *args, **kwargs):
7 | model_name = config['MODEL']['HEAD']
8 | if not is_model(model_name):
9 |         raise ValueError(f'Unknown model: {model_name}')
10 |
11 | body = model_entrypoints(model_name)(config, *args, **kwargs)
12 | return body
--------------------------------------------------------------------------------
/dinov/body/decoder/__init__.py:
--------------------------------------------------------------------------------
1 | from .build import build_decoder
2 | from .dinov_openset_decoder import *
3 |
--------------------------------------------------------------------------------
/dinov/body/decoder/build.py:
--------------------------------------------------------------------------------
1 | from .registry import model_entrypoints
2 | from .registry import is_model
3 |
4 |
5 | def build_decoder(config, *args, **kwargs):
6 | model_name = config['MODEL']['DECODER']['NAME']
7 |
8 | if not is_model(model_name):
9 |         raise ValueError(f'Unknown model: {model_name}')
10 |
11 | return model_entrypoints(model_name)(config, *args, **kwargs)
--------------------------------------------------------------------------------
/dinov/body/decoder/registry.py:
--------------------------------------------------------------------------------
1 | _model_entrypoints = {}
2 |
3 | def register_decoder(fn):
4 | module_name_split = fn.__module__.split('.')
5 | model_name = module_name_split[-1]
6 | _model_entrypoints[model_name] = fn
7 | return fn
8 |
9 | def model_entrypoints(model_name):
10 | return _model_entrypoints[model_name]
11 |
12 | def is_model(model_name):
13 | return model_name in _model_entrypoints
--------------------------------------------------------------------------------
/dinov/body/decoder/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .utils import *
--------------------------------------------------------------------------------
/dinov/body/decoder/utils/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import copy
3 | from torch import nn, Tensor
4 | import os
5 |
6 | import math
7 | import torch.nn.functional as F
8 | from torch import nn
9 |
10 |
11 | class MLP(nn.Module):
12 | """ Very simple multi-layer perceptron (also called FFN)"""
13 |
14 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
15 | super().__init__()
16 | self.num_layers = num_layers
17 | h = [hidden_dim] * (num_layers - 1)
18 | self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
19 |
20 | def forward(self, x):
21 | for i, layer in enumerate(self.layers):
22 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
23 | return x
24 |
25 |
26 | def inverse_sigmoid(x, eps=1e-5):
27 | x = x.clamp(min=0, max=1)
28 | x1 = x.clamp(min=eps)
29 | x2 = (1 - x).clamp(min=eps)
30 | return torch.log(x1/x2)
31 |
32 |
33 | def gen_encoder_output_proposals(memory:Tensor, memory_padding_mask:Tensor, spatial_shapes:Tensor):
34 | """
35 | Input:
36 | - memory: bs, \sum{hw}, d_model
37 | - memory_padding_mask: bs, \sum{hw}
38 | - spatial_shapes: nlevel, 2
39 | Output:
40 | - output_memory: bs, \sum{hw}, d_model
41 | - output_proposals: bs, \sum{hw}, 4
42 | """
43 | N_, S_, C_ = memory.shape
44 | base_scale = 4.0
45 | proposals = []
46 | _cur = 0
47 | for lvl, (H_, W_) in enumerate(spatial_shapes):
48 | mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1)
49 | valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
50 | valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1)
51 |
52 | grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
53 | torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device))
54 | grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)
55 |
56 | scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2)
57 | grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale
58 | wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl)
59 | proposal = torch.cat((grid, wh), -1).view(N_, -1, 4)
60 | proposals.append(proposal)
61 | _cur += (H_ * W_)
62 | output_proposals = torch.cat(proposals, 1)
63 | output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
64 | output_proposals = torch.log(output_proposals / (1 - output_proposals))
65 | output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf'))
66 | output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf'))
67 |
68 | output_memory = memory
69 | output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0))
70 | output_memory = output_memory.masked_fill(~output_proposals_valid, float(0))
71 | return output_memory, output_proposals
72 |
73 |
74 | def gen_sineembed_for_position(pos_tensor, dim=128):
75 | # n_query, bs, _ = pos_tensor.size()
76 | # sineembed_tensor = torch.zeros(n_query, bs, 256)
77 | scale = 2 * math.pi
78 | dim_t = torch.arange(dim, dtype=torch.float32, device=pos_tensor.device)
79 | dim_t = 10000 ** (2 * (dim_t // 2) / dim)
80 | x_embed = pos_tensor[:, :, 0] * scale
81 | y_embed = pos_tensor[:, :, 1] * scale
82 | pos_x = x_embed[:, :, None] / dim_t
83 | pos_y = y_embed[:, :, None] / dim_t
84 | pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2)
85 | pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2)
86 | if pos_tensor.size(-1) == 2:
87 | pos = torch.cat((pos_y, pos_x), dim=2)
88 | elif pos_tensor.size(-1) == 4:
89 | w_embed = pos_tensor[:, :, 2] * scale
90 | pos_w = w_embed[:, :, None] / dim_t
91 | pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2)
92 |
93 | h_embed = pos_tensor[:, :, 3] * scale
94 | pos_h = h_embed[:, :, None] / dim_t
95 | pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2)
96 |
97 | pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2)
98 | else:
99 | raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1)))
100 | return pos
101 |
102 |
103 | def _get_activation_fn(activation):
104 | """Return an activation function given a string"""
105 | if activation == "relu":
106 | return F.relu
107 | if activation == "gelu":
108 | return F.gelu
109 | if activation == "glu":
110 | return F.glu
111 | if activation == "prelu":
112 | return nn.PReLU()
113 | if activation == "selu":
114 | return F.selu
115 |     raise RuntimeError(F"activation should be relu/gelu/glu/prelu/selu, not {activation}.")
116 |
117 |
118 | def _get_clones(module, N, layer_share=False):
119 |
120 | if layer_share:
121 | return nn.ModuleList([module for i in range(N)])
122 | else:
123 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
124 |
125 | def from_divisablity(x, div):
126 | if x % div == 0:
127 | return x
128 | return (int(x / div) + 1) * div
129 |
130 | def getIdx(a, id_start):
131 | co = a.unsqueeze(0) - a.unsqueeze(1)
132 | uniquer = co.unique(dim=0)
133 | out = []
134 | for r in uniquer:
135 | cover = torch.arange(a.size(0)).to(a)
136 | mask = r == 0
137 | idx = cover[mask]
138 | out.append(idx)
139 | out = [o + id_start for o in out]
140 | return {str(k.cpu().numpy()): v for k, v in zip(a.unique(), out[::-1])}
141 |
142 | def get_world_size():
143 | if torch.distributed.is_initialized():
144 | return torch.distributed.get_world_size()
145 | return 1
146 |
147 | def all_gather(x):
148 | if get_world_size() > 1:
149 | all_x = [torch.zeros_like(x) for _ in range(get_world_size())]
150 | torch.distributed.all_gather(all_x, x.detach())
151 | all_x[torch.distributed.get_rank()] = x
152 | x = torch.stack(all_x, dim=0)
153 | return x
154 |
155 |
156 | def get_unpadded_tensor(tensors, num_examples):
157 | new_tensor_list = []
158 | for i, tensor in enumerate(tensors):
159 | new_tensor_list.append(tensor[:num_examples[i]])
160 | return new_tensor_list
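
A quick numerical check (sketch) that inverse_sigmoid above is the clamped logit, i.e. it inverts torch.sigmoid away from the clamp boundaries:

import torch

x = torch.tensor([0.0, 0.1, 0.5, 0.9, 1.0])
y = torch.log(x.clamp(min=1e-5) / (1 - x).clamp(min=1e-5))   # same as inverse_sigmoid(x)
print(torch.sigmoid(y))   # ~[0., 0.1, 0.5, 0.9, 1.]
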
--------------------------------------------------------------------------------
/dinov/body/encoder/__init__.py:
--------------------------------------------------------------------------------
1 | from .build import build_encoder
--------------------------------------------------------------------------------
/dinov/body/encoder/build.py:
--------------------------------------------------------------------------------
1 | from .registry import model_entrypoints
2 | from .registry import is_model
3 | from .encoder_deform import *
4 |
5 | def build_encoder(config, *args, **kwargs):
6 | model_name = config['MODEL']['ENCODER']['NAME']
7 |
8 | if not is_model(model_name):
9 |         raise ValueError(f'Unknown model: {model_name}')
10 |
11 | return model_entrypoints(model_name)(config, *args, **kwargs)
--------------------------------------------------------------------------------
/dinov/body/encoder/ops/functions/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 |
12 | from .ms_deform_attn_func import MSDeformAttnFunction
13 |
14 |
--------------------------------------------------------------------------------
/dinov/body/encoder/ops/functions/ms_deform_attn_func.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 |
12 | from __future__ import absolute_import
13 | from __future__ import print_function
14 | from __future__ import division
15 |
16 | import torch
17 | import torch.nn.functional as F
18 | from torch.autograd import Function
19 | from torch.autograd.function import once_differentiable
20 |
21 | try:
22 | import MultiScaleDeformableAttention as MSDA
23 | except ModuleNotFoundError as e:
24 | info_string = (
25 | "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n"
26 | "\t`cd mask2former/modeling/pixel_decoder/ops`\n"
27 | "\t`sh make.sh`\n"
28 | )
29 | raise ModuleNotFoundError(info_string)
30 |
31 |
32 | class MSDeformAttnFunction(Function):
33 | @staticmethod
34 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
35 | ctx.im2col_step = im2col_step
36 | output = MSDA.ms_deform_attn_forward(
37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
38 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
39 | return output
40 |
41 | @staticmethod
42 | @once_differentiable
43 | def backward(ctx, grad_output):
44 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
45 | grad_value, grad_sampling_loc, grad_attn_weight = \
46 | MSDA.ms_deform_attn_backward(
47 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
48 |
49 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
50 |
51 |
52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
53 | # for debug and test only,
54 | # need to use cuda version instead
55 | N_, S_, M_, D_ = value.shape
56 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape
57 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
58 | sampling_grids = 2 * sampling_locations - 1
59 | sampling_value_list = []
60 | for lid_, (H_, W_) in enumerate(value_spatial_shapes):
61 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
62 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)
63 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
64 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
65 | # N_*M_, D_, Lq_, P_
66 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
67 | mode='bilinear', padding_mode='zeros', align_corners=False)
68 | sampling_value_list.append(sampling_value_l_)
69 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
70 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
71 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
72 | return output.transpose(1, 2).contiguous()
73 |
--------------------------------------------------------------------------------
/dinov/body/encoder/ops/modules/__init__.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 |
12 | from .ms_deform_attn import MSDeformAttn
13 |
--------------------------------------------------------------------------------
/dinov/body/encoder/ops/setup.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 |
12 | import os
13 | import glob
14 |
15 | import torch
16 |
17 | from torch.utils.cpp_extension import CUDA_HOME
18 | from torch.utils.cpp_extension import CppExtension
19 | from torch.utils.cpp_extension import CUDAExtension
20 |
21 | from setuptools import find_packages
22 | from setuptools import setup
23 |
24 | requirements = ["torch", "torchvision"]
25 |
26 | def get_extensions():
27 | this_dir = os.path.dirname(os.path.abspath(__file__))
28 | extensions_dir = os.path.join(this_dir, "src")
29 |
30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
33 |
34 | sources = main_file + source_cpu
35 | extension = CppExtension
36 | extra_compile_args = {"cxx": []}
37 | define_macros = []
38 |
39 |     # Force CUDA since torch asks for a device, not whether CUDA is in fact available.
40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None:
41 | extension = CUDAExtension
42 | sources += source_cuda
43 | define_macros += [("WITH_CUDA", None)]
44 | extra_compile_args["nvcc"] = [
45 | "-DCUDA_HAS_FP16=1",
46 | "-D__CUDA_NO_HALF_OPERATORS__",
47 | "-D__CUDA_NO_HALF_CONVERSIONS__",
48 | "-D__CUDA_NO_HALF2_OPERATORS__",
49 | ]
50 | else:
51 | if CUDA_HOME is None:
52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.')
53 | else:
54 | raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().')
55 |
56 | sources = [os.path.join(extensions_dir, s) for s in sources]
57 | include_dirs = [extensions_dir]
58 | ext_modules = [
59 | extension(
60 | "MultiScaleDeformableAttention",
61 | sources,
62 | include_dirs=include_dirs,
63 | define_macros=define_macros,
64 | extra_compile_args=extra_compile_args,
65 | )
66 | ]
67 | return ext_modules
68 |
69 | setup(
70 | name="MultiScaleDeformableAttention",
71 | version="1.0",
72 | author="Weijie Su",
73 | url="https://github.com/fundamentalvision/Deformable-DETR",
74 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
75 | packages=find_packages(exclude=("configs", "tests",)),
76 | ext_modules=get_extensions(),
77 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
78 | )
79 |
--------------------------------------------------------------------------------
/dinov/body/encoder/ops/src/cpu/ms_deform_attn_cpu.cpp:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #include <vector>
17 |
18 | #include <ATen/ATen.h>
19 | #include <ATen/cuda/CUDAContext.h>
20 |
21 |
22 | at::Tensor
23 | ms_deform_attn_cpu_forward(
24 | const at::Tensor &value,
25 | const at::Tensor &spatial_shapes,
26 | const at::Tensor &level_start_index,
27 | const at::Tensor &sampling_loc,
28 | const at::Tensor &attn_weight,
29 | const int im2col_step)
30 | {
31 |     AT_ERROR("Not implemented on the CPU");
32 | }
33 |
34 | std::vector<at::Tensor>
35 | ms_deform_attn_cpu_backward(
36 | const at::Tensor &value,
37 | const at::Tensor &spatial_shapes,
38 | const at::Tensor &level_start_index,
39 | const at::Tensor &sampling_loc,
40 | const at::Tensor &attn_weight,
41 | const at::Tensor &grad_output,
42 | const int im2col_step)
43 | {
44 |     AT_ERROR("Not implemented on the CPU");
45 | }
46 |
47 |
--------------------------------------------------------------------------------
/dinov/body/encoder/ops/src/cpu/ms_deform_attn_cpu.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #pragma once
17 | #include <torch/extension.h>
18 |
19 | at::Tensor
20 | ms_deform_attn_cpu_forward(
21 | const at::Tensor &value,
22 | const at::Tensor &spatial_shapes,
23 | const at::Tensor &level_start_index,
24 | const at::Tensor &sampling_loc,
25 | const at::Tensor &attn_weight,
26 | const int im2col_step);
27 |
28 | std::vector<at::Tensor>
29 | ms_deform_attn_cpu_backward(
30 | const at::Tensor &value,
31 | const at::Tensor &spatial_shapes,
32 | const at::Tensor &level_start_index,
33 | const at::Tensor &sampling_loc,
34 | const at::Tensor &attn_weight,
35 | const at::Tensor &grad_output,
36 | const int im2col_step);
37 |
38 |
39 |
--------------------------------------------------------------------------------
/dinov/body/encoder/ops/src/cuda/ms_deform_attn_cuda.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #pragma once
17 | #include <torch/extension.h>
18 |
19 | at::Tensor ms_deform_attn_cuda_forward(
20 | const at::Tensor &value,
21 | const at::Tensor &spatial_shapes,
22 | const at::Tensor &level_start_index,
23 | const at::Tensor &sampling_loc,
24 | const at::Tensor &attn_weight,
25 | const int im2col_step);
26 |
27 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
28 | const at::Tensor &value,
29 | const at::Tensor &spatial_shapes,
30 | const at::Tensor &level_start_index,
31 | const at::Tensor &sampling_loc,
32 | const at::Tensor &attn_weight,
33 | const at::Tensor &grad_output,
34 | const int im2col_step);
35 |
36 |
--------------------------------------------------------------------------------
/dinov/body/encoder/ops/src/ms_deform_attn.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #pragma once
17 |
18 | #include "cpu/ms_deform_attn_cpu.h"
19 |
20 | #ifdef WITH_CUDA
21 | #include "cuda/ms_deform_attn_cuda.h"
22 | #endif
23 |
24 |
25 | at::Tensor
26 | ms_deform_attn_forward(
27 | const at::Tensor &value,
28 | const at::Tensor &spatial_shapes,
29 | const at::Tensor &level_start_index,
30 | const at::Tensor &sampling_loc,
31 | const at::Tensor &attn_weight,
32 | const int im2col_step)
33 | {
34 | if (value.type().is_cuda())
35 | {
36 | #ifdef WITH_CUDA
37 | return ms_deform_attn_cuda_forward(
38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
39 | #else
40 | AT_ERROR("Not compiled with GPU support");
41 | #endif
42 | }
43 | AT_ERROR("Not implemented on the CPU");
44 | }
45 |
46 | std::vector<at::Tensor>
47 | ms_deform_attn_backward(
48 | const at::Tensor &value,
49 | const at::Tensor &spatial_shapes,
50 | const at::Tensor &level_start_index,
51 | const at::Tensor &sampling_loc,
52 | const at::Tensor &attn_weight,
53 | const at::Tensor &grad_output,
54 | const int im2col_step)
55 | {
56 | if (value.type().is_cuda())
57 | {
58 | #ifdef WITH_CUDA
59 | return ms_deform_attn_cuda_backward(
60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
61 | #else
62 | AT_ERROR("Not compiled with GPU support");
63 | #endif
64 | }
65 | AT_ERROR("Not implemented on the CPU");
66 | }
67 |
68 |
--------------------------------------------------------------------------------
/dinov/body/encoder/ops/src/vision.cpp:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 |
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 |
16 | #include "ms_deform_attn.h"
17 |
18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
21 | }
22 |
--------------------------------------------------------------------------------
/dinov/body/encoder/ops/test.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5 | # ------------------------------------------------------------------------------------------------
6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7 | # ------------------------------------------------------------------------------------------------
8 |
9 | # Copyright (c) Facebook, Inc. and its affiliates.
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
11 |
12 | from __future__ import absolute_import
13 | from __future__ import print_function
14 | from __future__ import division
15 |
16 | import time
17 | import torch
18 | import torch.nn as nn
19 | from torch.autograd import gradcheck
20 |
21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
22 |
23 |
24 | N, M, D = 1, 2, 2
25 | Lq, L, P = 2, 2, 2
26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()
27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))
28 | S = sum([(H*W).item() for H, W in shapes])
29 |
30 |
31 | torch.manual_seed(3)
32 |
33 |
34 | @torch.no_grad()
35 | def check_forward_equal_with_pytorch_double():
36 | value = torch.rand(N, S, M, D).cuda() * 0.01
37 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
38 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
39 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
40 | im2col_step = 2
41 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu()
42 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu()
43 | fwdok = torch.allclose(output_cuda, output_pytorch)
44 | max_abs_err = (output_cuda - output_pytorch).abs().max()
45 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
46 |
47 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
48 |
49 |
50 | @torch.no_grad()
51 | def check_forward_equal_with_pytorch_float():
52 | value = torch.rand(N, S, M, D).cuda() * 0.01
53 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
54 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
55 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
56 | im2col_step = 2
57 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu()
58 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu()
59 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
60 | max_abs_err = (output_cuda - output_pytorch).abs().max()
61 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
62 |
63 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
64 |
65 |
66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True):
67 |
68 | value = torch.rand(N, S, M, channels).cuda() * 0.01
69 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
70 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
71 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
72 | im2col_step = 2
73 | func = MSDeformAttnFunction.apply
74 |
75 | value.requires_grad = grad_value
76 | sampling_locations.requires_grad = grad_sampling_loc
77 | attention_weights.requires_grad = grad_attn_weight
78 |
79 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step))
80 |
81 | print(f'* {gradok} check_gradient_numerical(D={channels})')
82 |
83 |
84 | if __name__ == '__main__':
85 | check_forward_equal_with_pytorch_double()
86 | check_forward_equal_with_pytorch_float()
87 |
88 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]:
89 | check_gradient_numerical(channels, True, True, True)
90 |
91 |
92 |
93 |
--------------------------------------------------------------------------------
/dinov/body/encoder/registry.py:
--------------------------------------------------------------------------------
1 | _model_entrypoints = {}
2 |
3 | def register_encoder(fn):
4 | module_name_split = fn.__module__.split('.')
5 | model_name = module_name_split[-1]
6 | _model_entrypoints[model_name] = fn
7 | return fn
8 |
9 | def model_entrypoints(model_name):
10 | return _model_entrypoints[model_name]
11 |
12 | def is_model(model_name):
13 | return model_name in _model_entrypoints
14 |
--------------------------------------------------------------------------------
/dinov/body/general_head.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Copyright (c) MicroSoft, Inc. and its affiliates.
3 | # Modified from DINO https://github.com/IDEA-Research/MaskDINO by Feng Li.
4 | # ------------------------------------------------------------------------
5 | import logging
6 | from typing import Callable, Dict, List, Optional, Tuple, Union
7 |
8 | from torch import nn
9 |
10 | from detectron2.layers import Conv2d, ShapeSpec, get_norm
11 |
12 | from .registry import register_body
13 | from .encoder import build_encoder
14 | from .decoder import build_decoder
15 | from ..utils import configurable
16 |
17 |
18 | class IMaskDINOHead(nn.Module):
19 | @configurable
20 | def __init__(
21 | self,
22 | input_shape: Dict[str, ShapeSpec],
23 | *,
24 | num_classes: int,
25 | pixel_decoder: nn.Module,
26 | loss_weight: float = 1.0,
27 | ignore_value: int = -1,
28 | transformer_predictor: nn.Module,
29 | ):
30 | """
31 | Args:
32 | input_shape: shapes (channels and stride) of the input features
33 | num_classes: number of classes to predict
34 | pixel_decoder: the pixel decoder module
35 | loss_weight: loss weight
36 | ignore_value: category id to be ignored during training.
37 | transformer_predictor: the transformer decoder that makes prediction
38 | transformer_in_feature: input feature name to the transformer_predictor
39 | """
40 | super().__init__()
41 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
42 | self.in_features = [k for k, v in input_shape]
43 | self.ignore_value = ignore_value
44 | self.common_stride = 4
45 | self.loss_weight = loss_weight
46 |
47 | self.pixel_decoder = pixel_decoder
48 | self.predictor = transformer_predictor
49 |
50 | self.num_classes = num_classes
51 | # store processed features
52 | self.processed_features = None
53 |
54 | @classmethod
55 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec], lang_encoder: nn.Module, extra: dict):
56 | enc_cfg = cfg['MODEL']['ENCODER']
57 | dec_cfg = cfg['MODEL']['DECODER']
58 | transformer_predictor_in_channels = enc_cfg['CONVS_DIM']
59 |
60 | return {
61 | "input_shape": {
62 | k: v for k, v in input_shape.items() if k in enc_cfg['IN_FEATURES']
63 | },
64 | "ignore_value": enc_cfg['IGNORE_VALUE'],
65 | "num_classes": enc_cfg.get('NUM_CLASSES', None),
66 | "pixel_decoder": build_encoder(cfg, input_shape),
67 | "loss_weight": enc_cfg['LOSS_WEIGHT'],
68 | "transformer_predictor": build_decoder(
69 | cfg,
70 | transformer_predictor_in_channels,
71 | lang_encoder,
72 | mask_classification=True,
73 | extra=extra,
74 | ),
75 | }
76 |
77 | def forward_encoder(self, features, mask=None,targets=None, target_queries=None, target_vlp=None, prediction_switch=None, task='seg', extra={}):
78 | mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(
79 | features, mask)
80 | self.processed_features = (mask_features, transformer_encoder_features, multi_scale_features)
81 |
82 | def forward_decoder(self, features, mask=None,targets=None, target_queries=None, target_vlp=None, prediction_switch=None, task='seg', extra={}):
83 |         assert self.processed_features is not None, "need to process features first"
84 | mask_features, transformer_encoder_features, multi_scale_features = self.processed_features
85 | predictions = self.predictor(multi_scale_features, mask_features, mask, targets=targets,
86 | target_queries=target_queries, target_vlp=target_vlp, task=task, extra=extra)
87 | return predictions
88 |
89 | def forward(self, features, mask=None, targets=None, target_queries=None, target_vlp=None, task='seg', extra={}):
90 | return self.layers(features, mask, targets=targets, target_queries=target_queries, target_vlp=target_vlp, task=task, extra=extra)
91 |
92 | def layers(self, features, mask=None,targets=None, target_queries=None, target_vlp=None, prediction_switch=None, task='seg', extra={}):
93 | mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(features, mask)
94 | predictions = self.predictor(multi_scale_features, mask_features, mask, targets=targets,
95 | target_queries=target_queries, target_vlp=target_vlp, task=task, extra=extra)
96 | return predictions
97 |
98 |
99 | @register_body
100 | def get_interactive_maskdino_head(cfg, input_shape, lang_encoder, extra):
101 | return IMaskDINOHead(cfg, input_shape, lang_encoder, extra)
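
IMaskDINOHead above splits the usual layers path into forward_encoder (run once, with the pixel-decoder outputs cached in self.processed_features) and forward_decoder (run per prompt on the cached features). A minimal standalone sketch of that caching pattern, with toy stand-ins for the pixel decoder and predictor:

class TwoStageHead:
    def __init__(self, pixel_decoder, predictor):
        self.pixel_decoder, self.predictor = pixel_decoder, predictor
        self.processed_features = None

    def forward_encoder(self, features):
        self.processed_features = self.pixel_decoder(features)      # encode once

    def forward_decoder(self, prompt):
        assert self.processed_features is not None, "need to process features first"
        return self.predictor(self.processed_features, prompt)      # reuse per prompt

head = TwoStageHead(pixel_decoder=lambda f: f * 2, predictor=lambda f, p: f + p)
head.forward_encoder(10)
print(head.forward_decoder(1), head.forward_decoder(2))   # 21 22
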
--------------------------------------------------------------------------------
/dinov/body/registry.py:
--------------------------------------------------------------------------------
1 | _model_entrypoints = {}
2 |
3 |
4 | def register_body(fn):
5 | module_name_split = fn.__module__.split('.')
6 | model_name = module_name_split[-1]
7 | _model_entrypoints[model_name] = fn
8 | return fn
9 |
10 | def model_entrypoints(model_name):
11 | return _model_entrypoints[model_name]
12 |
13 | def is_model(model_name):
14 | return model_name in _model_entrypoints
--------------------------------------------------------------------------------
/dinov/language/__init__.py:
--------------------------------------------------------------------------------
1 | from .build import build_language_encoder
--------------------------------------------------------------------------------
/dinov/language/build.py:
--------------------------------------------------------------------------------
1 | """
2 | placeholder for language open-set or grounding
3 | """
4 |
5 |
6 | def build_language_encoder(config, **kwargs):
7 | model_name = config['MODEL']['TEXT']['ARCH']
8 | if model_name=='noencoder':
9 | return None
10 | else:
11 | raise NotImplementedError
--------------------------------------------------------------------------------
/dinov/modules/__init__.py:
--------------------------------------------------------------------------------
1 | from .position_encoding import *
2 | from .postprocessing import *
3 | from .matcher import *
4 | from .criterion_visual_refer_one2one import *
5 | from .criterion_visual_openset import *
6 | from .criterion_visual_refer_many2many import *
7 | from .matcher_many2many import *
8 |
9 |
--------------------------------------------------------------------------------
/dinov/modules/position_encoding.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py
3 | """
4 | Various positional encodings for the transformer.
5 | """
6 | import math
7 |
8 | import torch
9 | from torch import nn
10 |
11 |
12 | class PositionEmbeddingSine(nn.Module):
13 | """
14 | This is a more standard version of the position embedding, very similar to the one
15 | used by the Attention is all you need paper, generalized to work on images.
16 | """
17 |
18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
19 | super().__init__()
20 | self.num_pos_feats = num_pos_feats
21 | self.temperature = temperature
22 | self.normalize = normalize
23 | if scale is not None and normalize is False:
24 | raise ValueError("normalize should be True if scale is passed")
25 | if scale is None:
26 | scale = 2 * math.pi
27 | self.scale = scale
28 |
29 | def forward(self, x, mask=None):
30 | if mask is None:
31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool)
32 | not_mask = ~mask
33 | y_embed = not_mask.cumsum(1, dtype=x.dtype)
34 | x_embed = not_mask.cumsum(2, dtype=x.dtype)
35 | if self.normalize:
36 | eps = 1e-6
37 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
38 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
39 |
40 | dim_t = torch.arange(self.num_pos_feats, dtype=x.dtype, device=x.device)
41 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
42 |
43 | pos_x = x_embed[:, :, :, None] / dim_t
44 | pos_y = y_embed[:, :, :, None] / dim_t
45 | pos_x = torch.stack(
46 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
47 | ).flatten(3)
48 | pos_y = torch.stack(
49 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
50 | ).flatten(3)
51 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
52 | return pos
53 |
54 | def __repr__(self, _repr_indent=4):
55 | head = "Positional encoding " + self.__class__.__name__
56 | body = [
57 | "num_pos_feats: {}".format(self.num_pos_feats),
58 | "temperature: {}".format(self.temperature),
59 | "normalize: {}".format(self.normalize),
60 | "scale: {}".format(self.scale),
61 | ]
62 | # _repr_indent = 4
63 | lines = [head] + [" " * _repr_indent + line for line in body]
64 | return "\n".join(lines)
65 |
--------------------------------------------------------------------------------
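A small sketch (not part of the repo) of PositionEmbeddingSine on a dummy feature map; the output has 2 * num_pos_feats channels, one sine/cosine stack per spatial axis:

    import torch

    pe = PositionEmbeddingSine(num_pos_feats=128, normalize=True)
    feat = torch.randn(2, 256, 32, 32)       # (B, C, H, W) feature map
    pos = pe(feat)                            # mask defaults to all-valid pixels
    assert pos.shape == (2, 256, 32, 32)      # channels = 2 * num_pos_feats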
/dinov/modules/postprocessing.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import torch
3 | from torch.nn import functional as F
4 |
5 | from detectron2.structures import Instances, ROIMasks
6 |
7 |
8 | def sem_seg_postprocess(result, img_size, output_height, output_width):
9 | """
10 | Return semantic segmentation predictions in the original resolution.
11 |
12 | The input images are often resized when entering the semantic segmentor. Moreover, in some
13 | cases, they are also padded inside the segmentor to be divisible by the maximum network stride.
14 | As a result, we often need the predictions of the segmentor in a different
15 | resolution from its inputs.
16 |
17 | Args:
18 | result (Tensor): semantic segmentation prediction logits. A tensor of shape (C, H, W),
19 | where C is the number of classes, and H, W are the height and width of the prediction.
20 | img_size (tuple): image size that segmentor is taking as input.
21 | output_height, output_width: the desired output resolution.
22 |
23 | Returns:
24 | semantic segmentation prediction (Tensor): A tensor of the shape
25 | (C, output_height, output_width) that contains per-pixel soft predictions.
26 | """
27 | if len(result.shape)>3:
28 | result = result[:, :, : img_size[0], : img_size[1]].expand(1, -1, -1, -1)
29 | else:
30 | result = result[:, : img_size[0], : img_size[1]].expand(1, -1, -1, -1)
31 | result = F.interpolate(
32 | result, size=(output_height, output_width), mode="bicubic", align_corners=False, antialias=True
33 | )[0]
34 | return result
35 |
--------------------------------------------------------------------------------
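A short usage sketch for sem_seg_postprocess with illustrative shapes: logits computed on a padded 512x512 input whose valid region is 480x512, upsampled back to the original 720x1280 image:

    import torch

    logits = torch.randn(19, 512, 512)   # (C, H_pad, W_pad) logits from the segmentor
    out = sem_seg_postprocess(logits, img_size=(480, 512), output_height=720, output_width=1280)
    assert out.shape == (19, 720, 1280)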
/dinov/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .config import *
2 | from .misc import *
3 | from .box_ops import *
--------------------------------------------------------------------------------
/dinov/utils/box_ops.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2 | """
3 | Utilities for bounding box manipulation and GIoU.
4 | """
5 | import torch
6 | from torchvision.ops.boxes import box_area
7 | import numpy as np
8 |
9 |
10 | def build_point_grid(n_per_side: int) -> np.ndarray:
11 | """Generates a 2D grid of points evenly spaced in [0,1]x[0,1]."""
12 | offset = 1 / (2 * n_per_side)
13 | points_one_side = np.linspace(offset, 1 - offset, n_per_side)
14 | points_x = np.tile(points_one_side[None, :], (n_per_side, 1))
15 | points_y = np.tile(points_one_side[:, None], (1, n_per_side))
16 | points = np.stack([points_x, points_y], axis=-1).reshape(-1, 2)
17 | return points
18 |
19 | def box_cxcywh_to_xyxy(x):
20 | x_c, y_c, w, h = x.unbind(-1)
21 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
22 | (x_c + 0.5 * w), (y_c + 0.5 * h)]
23 | return torch.stack(b, dim=-1)
24 |
25 |
26 | def box_xyxy_to_cxcywh(x):
27 | x0, y0, x1, y1 = x.unbind(-1)
28 | b = [(x0 + x1) / 2, (y0 + y1) / 2,
29 | (x1 - x0), (y1 - y0)]
30 | return torch.stack(b, dim=-1)
31 |
32 | def box_xywh_to_xyxy(x):
33 | x0, y0, w, h = x.unbind(-1)
34 | b = [x0, y0, (x0 + w), (y0 + h)]
35 | return torch.stack(b, dim=-1)
36 |
37 |
38 | # modified from torchvision to also return the union
39 | def box_iou(boxes1, boxes2):
40 | area1 = box_area(boxes1)
41 | area2 = box_area(boxes2)
42 |
43 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
44 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
45 |
46 | wh = (rb - lt).clamp(min=0) # [N,M,2]
47 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M]
48 |
49 | union = area1[:, None] + area2 - inter
50 |
51 | iou = inter / (union+1e-6)
52 | return iou, union
53 |
54 |
55 | def generalized_box_iou(boxes1, boxes2):
56 | """
57 | Generalized IoU from https://giou.stanford.edu/
58 |
59 | The boxes should be in [x0, y0, x1, y1] format
60 |
61 | Returns a [N, M] pairwise matrix, where N = len(boxes1)
62 | and M = len(boxes2)
63 | """
64 | # degenerate boxes give inf / nan results
65 | # so do an early check
66 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
67 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
68 | iou, union = box_iou(boxes1, boxes2)
69 |
70 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
71 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
72 |
73 | wh = (rb - lt).clamp(min=0) # [N,M,2]
74 | area = wh[:, :, 0] * wh[:, :, 1]
75 |
76 | return iou - (area - union) / (area+1e-6)
77 |
78 | def generalized_box_iou_padded(boxes1, boxes2):
79 | """
80 | Generalized IoU from https://giou.stanford.edu/
81 |
82 | The boxes should be in [x0, y0, x1, y1] format
83 |
84 | Returns a [N, M] pairwise matrix, where N = len(boxes1)
85 | and M = len(boxes2)
86 | """
87 | # degenerate boxes give inf / nan results
88 | # so do an early check
89 | # assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
90 | # assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
91 | iou, union = box_iou(boxes1, boxes2)
92 |
93 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
94 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
95 |
96 | wh = (rb - lt).clamp(min=0) # [N,M,2]
97 | area = wh[:, :, 0] * wh[:, :, 1]
98 |
99 | return iou - (area - union) / (area+1e-6)
100 |
101 |
102 | def masks_to_boxes(masks):
103 | """Compute the bounding boxes around the provided masks
104 |
105 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions.
106 |
107 | Returns a [N, 4] tensors, with the boxes in xyxy format
108 | """
109 | if masks.numel() == 0:
110 | return torch.zeros((0, 4), device=masks.device)
111 |
112 | h, w = masks.shape[-2:]
113 |
114 | y = torch.arange(0, h, dtype=torch.float)
115 | x = torch.arange(0, w, dtype=torch.float)
116 | y, x = torch.meshgrid(y, x)
117 |
118 | x_mask = (masks * x.unsqueeze(0))
119 | x_max = x_mask.flatten(1).max(-1)[0]
120 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
121 |
122 | y_mask = (masks * y.unsqueeze(0))
123 | y_max = y_mask.flatten(1).max(-1)[0]
124 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
125 |
126 | return torch.stack([x_min, y_min, x_max, y_max], 1)
--------------------------------------------------------------------------------
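A quick sketch exercising the helpers above: the cxcywh/xyxy conversions are inverses of each other, and generalized_box_iou is about 1 for identical boxes and negative for disjoint ones:

    import torch

    cxcywh = torch.tensor([[0.5, 0.5, 0.2, 0.2]])
    xyxy = box_cxcywh_to_xyxy(cxcywh)              # -> [[0.4, 0.4, 0.6, 0.6]]
    assert torch.allclose(box_xyxy_to_cxcywh(xyxy), cxcywh)

    boxes1 = torch.tensor([[0.0, 0.0, 1.0, 1.0]])
    boxes2 = torch.tensor([[0.0, 0.0, 1.0, 1.0], [2.0, 2.0, 3.0, 3.0]])
    giou = generalized_box_iou(boxes1, boxes2)     # shape [1, 2]
    # giou[0, 0] ~ 1 (same box), giou[0, 1] < 0 (disjoint boxes)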
/dinov/utils/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (c) Facebook, Inc. and its affiliates.
3 |
4 | import functools
5 | import inspect
6 |
7 | def configurable(init_func=None, *, from_config=None):
8 | """
9 | Decorate a function or a class's __init__ method so that it can be called
10 | with a :class:`CfgNode` object using a :func:`from_config` function that translates
11 | :class:`CfgNode` to arguments.
12 |
13 | Examples:
14 | ::
15 | # Usage 1: Decorator on __init__:
16 | class A:
17 | @configurable
18 | def __init__(self, a, b=2, c=3):
19 | pass
20 |
21 | @classmethod
22 | def from_config(cls, cfg): # 'cfg' must be the first argument
23 | # Returns kwargs to be passed to __init__
24 | return {"a": cfg.A, "b": cfg.B}
25 |
26 | a1 = A(a=1, b=2) # regular construction
27 | a2 = A(cfg) # construct with a cfg
28 | a3 = A(cfg, b=3, c=4) # construct with extra overwrite
29 |
30 | # Usage 2: Decorator on any function. Needs an extra from_config argument:
31 | @configurable(from_config=lambda cfg: {"a": cfg.A, "b": cfg.B})
32 | def a_func(a, b=2, c=3):
33 | pass
34 |
35 | a1 = a_func(a=1, b=2) # regular call
36 | a2 = a_func(cfg) # call with a cfg
37 | a3 = a_func(cfg, b=3, c=4) # call with extra overwrite
38 |
39 | Args:
40 | init_func (callable): a class's ``__init__`` method in usage 1. The
41 | class must have a ``from_config`` classmethod which takes `cfg` as
42 | the first argument.
43 | from_config (callable): the from_config function in usage 2. It must take `cfg`
44 | as its first argument.
45 | """
46 |
47 | if init_func is not None:
48 | assert (
49 | inspect.isfunction(init_func)
50 | and from_config is None
51 | and init_func.__name__ == "__init__"
52 | ), "Incorrect use of @configurable. Check API documentation for examples."
53 |
54 | @functools.wraps(init_func)
55 | def wrapped(self, *args, **kwargs):
56 | try:
57 | from_config_func = type(self).from_config
58 | except AttributeError as e:
59 | raise AttributeError(
60 | "Class with @configurable must have a 'from_config' classmethod."
61 | ) from e
62 | if not inspect.ismethod(from_config_func):
63 | raise TypeError("Class with @configurable must have a 'from_config' classmethod.")
64 |
65 | # import ipdb; ipdb.set_trace()
66 | if _called_with_cfg(*args, **kwargs):
67 | explicit_args = _get_args_from_config(from_config_func, *args, **kwargs)
68 | init_func(self, **explicit_args)
69 | else:
70 | init_func(self, *args, **kwargs)
71 |
72 | return wrapped
73 |
74 | else:
75 | if from_config is None:
76 | return configurable # @configurable() is made equivalent to @configurable
77 | assert inspect.isfunction(
78 | from_config
79 | ), "from_config argument of configurable must be a function!"
80 |
81 | def wrapper(orig_func):
82 | @functools.wraps(orig_func)
83 | def wrapped(*args, **kwargs):
84 | if _called_with_cfg(*args, **kwargs):
85 | explicit_args = _get_args_from_config(from_config, *args, **kwargs)
86 | return orig_func(**explicit_args)
87 | else:
88 | return orig_func(*args, **kwargs)
89 |
90 | wrapped.from_config = from_config
91 | return wrapped
92 |
93 | return wrapper
94 |
95 | def _called_with_cfg(*args, **kwargs):
96 | """
97 | Returns:
98 | bool: whether the arguments contain CfgNode and should be considered
99 | forwarded to from_config.
100 | """
101 | from omegaconf import DictConfig, OmegaConf, ListConfig
102 | # from detectron2.config import LazyConfig
103 |
104 | if len(args) and (isinstance(args[0], (dict)) or (isinstance(args[0], (DictConfig)))):
105 | return True
106 | if isinstance(kwargs.pop("cfg", None), (dict)):
107 | return True
108 | # `from_config`'s first argument is forced to be "cfg".
109 | # So the above check covers all cases.
110 | return False
111 |
112 | def _get_args_from_config(from_config_func, *args, **kwargs):
113 | """
114 | Use `from_config` to obtain explicit arguments.
115 |
116 | Returns:
117 | dict: arguments to be used for cls.__init__
118 | """
119 | signature = inspect.signature(from_config_func)
120 | if list(signature.parameters.keys())[0] != "cfg":
121 | if inspect.isfunction(from_config_func):
122 | name = from_config_func.__name__
123 | else:
124 | name = f"{from_config_func.__self__}.from_config"
125 | raise TypeError(f"{name} must take 'cfg' as the first argument!")
126 | support_var_arg = any(
127 | param.kind in [param.VAR_POSITIONAL, param.VAR_KEYWORD]
128 | for param in signature.parameters.values()
129 | )
130 | if support_var_arg: # forward all arguments to from_config, if from_config accepts them
131 | ret = from_config_func(*args, **kwargs)
132 | else:
133 | # forward supported arguments to from_config
134 | supported_arg_names = set(signature.parameters.keys())
135 | extra_kwargs = {}
136 | for name in list(kwargs.keys()):
137 | if name not in supported_arg_names:
138 | extra_kwargs[name] = kwargs.pop(name)
139 | ret = from_config_func(*args, **kwargs)
140 | # forward the other arguments to __init__
141 | ret.update(extra_kwargs)
142 | return ret
--------------------------------------------------------------------------------
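A minimal sketch of the two call styles @configurable enables; the Head class and config keys are made up for illustration:

    class Head:
        @configurable
        def __init__(self, dim, num_classes=80):
            self.dim, self.num_classes = dim, num_classes

        @classmethod
        def from_config(cls, cfg):                  # 'cfg' must be the first argument
            return {"dim": cfg["DIM"], "num_classes": cfg["NUM_CLASSES"]}

    h1 = Head(dim=256, num_classes=10)              # regular construction
    h2 = Head({"DIM": 256, "NUM_CLASSES": 80})      # a dict cfg is detected by _called_with_cfg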
/repo.diff:
--------------------------------------------------------------------------------
1 | diff --git openseed/architectures/joint_oi_model.py openseed/architectures/joint_oi_model.py
2 | index 8086690..a0679fe 100644
3 | --- openseed/architectures/joint_oi_model.py
4 | +++ openseed/architectures/joint_oi_model.py
5 | @@ -286,6 +286,7 @@ class GeneralizedMaskDINO(nn.Module):
6 | "coco_on": dec_cfg.get('COCO', True),
7 | "coco_mask_on": dec_cfg.get('COCO_MASK', True),
8 | "o365_on": dec_cfg.get('O365', True),
9 | + "regenerate_point": dec_cfg.get('RE_POINT', False),
10 | }
11 |
12 | @property
13 | @@ -531,7 +532,7 @@ class GeneralizedMaskDINO(nn.Module):
14 |
15 | # if not self.training:
16 | # box_start = int(num_mask/4*3)
17 | - box_start = random.randint(0, self.max_num_instance - 1) # box based interactive after this number; about 1/4
18 | + box_start = random.randint(1, self.max_num_instance - 1) # box based interactive after this number; about 1/4
19 | point_coords = targets_per_image.point_coords[index[:box_start]]
20 | # FIXME randomly sample one point as the user input
21 | if self.regenerate_point:
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch
2 | torchvision
3 | opencv-python
4 | pyyaml
5 | json_tricks
6 | yacs
7 | scikit-learn
8 | pandas
9 | timm==0.4.12
10 | numpy==1.23.5
11 | einops
12 | fvcore
13 | transformers==4.19.2
14 | sentencepiece
15 | ftfy
16 | regex
17 | nltk
18 | vision-datasets==0.2.2
19 | pycocotools==2.0.4
20 | diffdist
21 | pyarrow
22 | cityscapesscripts
23 | shapely
24 | scikit-image
25 | mup
26 | gradio==3.40.0
27 | scann
28 | kornia==0.6.4
29 | torchmetrics==0.6.0
30 | progressbar
31 | pillow==9.4.0
32 |
--------------------------------------------------------------------------------
/utils/Config.py:
--------------------------------------------------------------------------------
1 | from fvcore.common.config import CfgNode as _CfgNode
2 |
3 | class CfgNode(_CfgNode):
4 | """
5 | The same as `fvcore.common.config.CfgNode`, but different in:
6 |
7 | 1. Use unsafe yaml loading by default.
8 | Note that this may lead to arbitrary code execution: you must not
9 | load a config file from untrusted sources before manually inspecting
10 | the content of the file.
11 | 2. Support config versioning.
12 | When attempting to merge an old config, it will convert the old config automatically.
13 |
14 | .. automethod:: clone
15 | .. automethod:: freeze
16 | .. automethod:: defrost
17 | .. automethod:: is_frozen
18 | .. automethod:: load_yaml_with_base
19 | .. automethod:: merge_from_list
20 | .. automethod:: merge_from_other_cfg
21 | """
22 |
23 | def merge_from_dict(self, dict):
24 | pass
25 |
26 | node = CfgNode()
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .dist import *
--------------------------------------------------------------------------------
/utils/arguments.py:
--------------------------------------------------------------------------------
1 | import yaml
2 | import json
3 | import argparse
4 | import logging
5 |
6 | logger = logging.getLogger(__name__)
7 |
8 |
9 | def load_config_dict_to_opt(opt, config_dict):
10 | """
11 | Load the key, value pairs from config_dict to opt, overriding existing values in opt
12 | if there is any.
13 | """
14 | if not isinstance(config_dict, dict):
15 | raise TypeError("Config must be a Python dictionary")
16 | for k, v in config_dict.items():
17 | k_parts = k.split('.')
18 | pointer = opt
19 | for k_part in k_parts[:-1]:
20 | if k_part not in pointer:
21 | pointer[k_part] = {}
22 | pointer = pointer[k_part]
23 | assert isinstance(pointer, dict), "Overriding key needs to be inside a Python dict."
24 | ori_value = pointer.get(k_parts[-1])
25 | pointer[k_parts[-1]] = v
26 | if ori_value:
27 | logger.warning(f"Overrode {k} from {ori_value} to {pointer[k_parts[-1]]}")
28 |
29 | def load_opt_from_config_file(conf_file):
30 | """
31 | Load opt from a single config file.
32 |
33 | Args:
34 | conf_file: config file path
35 |
36 | Returns:
37 | dict: a dictionary of opt settings
38 | """
39 | opt = {}
40 | with open(conf_file, encoding='utf-8') as f:
41 | config_dict = yaml.safe_load(f)
42 |
43 | load_config_dict_to_opt(opt, config_dict)
44 |
45 | return opt
46 |
47 |
48 | def load_opt_from_config_files(conf_files):
49 | """
50 | Load opt from the config files, settings in later files can override those in previous files.
51 |
52 | Args:
53 | conf_files (list): a list of config file paths
54 |
55 | Returns:
56 | dict: a dictionary of opt settings
57 | """
58 | opt = {}
59 | for conf_file in conf_files:
60 | with open(conf_file, encoding='utf-8') as f:
61 | config_dict = yaml.safe_load(f)
62 |
63 | load_config_dict_to_opt(opt, config_dict)
64 |
65 | return opt
66 |
67 |
68 | def load_opt_command(args):
69 | parser = argparse.ArgumentParser(description='Pretrain or fine-tune models for NLP tasks.')
70 | parser.add_argument('command', help='Command: train/evaluate/train-and-evaluate')
71 | parser.add_argument('--conf_files', nargs='+', required=True, help='Path(s) to the config file(s).')
72 | parser.add_argument('--user_dir', help='Path to the user defined module for tasks (models, criteria), optimizers, and lr schedulers.')
73 | parser.add_argument('--config_overrides', nargs='*', help='Override parameters on config with a json style string, e.g. {"KEY": VALUE, "GROUP.SUBKEY": VALUE}. A key with "." updates the object in the corresponding nested dict. Remember to escape " in command line.')
74 | parser.add_argument('--overrides', help='arguments used to override the config file from the command line', nargs=argparse.REMAINDER)
75 |
76 | cmdline_args = parser.parse_args() if not args else parser.parse_args(args)
77 |
78 | opt = load_opt_from_config_files(cmdline_args.conf_files)
79 |
80 | if cmdline_args.config_overrides:
81 | config_overrides_string = ' '.join(cmdline_args.config_overrides)
82 | logger.warning(f"Command line config overrides: {config_overrides_string}")
83 | config_dict = json.loads(config_overrides_string)
84 | load_config_dict_to_opt(opt, config_dict)
85 |
86 | if cmdline_args.overrides:
87 | assert len(cmdline_args.overrides) % 2 == 0, "overrides arguments are not paired; required: key value"
88 | keys = [cmdline_args.overrides[idx*2] for idx in range(len(cmdline_args.overrides)//2)]
89 | vals = [cmdline_args.overrides[idx*2+1] for idx in range(len(cmdline_args.overrides)//2)]
90 | vals = [val.replace('false', '').replace('False', '') if len(val.replace(' ', '')) == 5 else val for val in vals]  # map the 5-character literals 'false'/'False' to an empty, falsy string
91 |
92 | types = []
93 | for key in keys:
94 | key = key.split('.')
95 | ele = opt.copy()
96 | while len(key) > 0:
97 | ele = ele[key.pop(0)]
98 | types.append(type(ele))
99 |
100 | config_dict = {x:z(y) for x,y,z in zip(keys, vals, types)}
101 | load_config_dict_to_opt(opt, config_dict)
102 |
103 | # combine cmdline_args into opt dictionary
104 | for key, val in cmdline_args.__dict__.items():
105 | if val is not None:
106 | opt[key] = val
107 |
108 | return opt, cmdline_args
109 |
110 |
111 | def save_opt_to_json(opt, conf_file):
112 | with open(conf_file, 'w', encoding='utf-8') as f:
113 | json.dump(opt, f, indent=4)
114 |
115 |
116 | def save_opt_to_yaml(opt, conf_file):
117 | with open(conf_file, 'w', encoding='utf-8') as f:
118 | yaml.dump(opt, f)
119 |
--------------------------------------------------------------------------------
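A sketch of calling load_opt_command programmatically. The key/value pairs after --overrides are cast to the type of the value already present in the merged YAML, so each key must already exist there; the override keys below are placeholders:

    args = [
        'train',
        '--conf_files', 'configs/dinov_sam_coco_train.yaml',
        '--overrides', 'SOLVER.BASE_LR', '0.0001', 'SOLVER.MAX_ITER', '90000',
    ]
    opt, cmdline_args = load_opt_command(args)
    # opt holds the YAML contents with the two keys overridden, plus the parsed
    # command-line fields (command, conf_files, ...).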
/utils/dist.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json, time
3 | import torch
4 | import torch.distributed as dist
5 |
6 |
7 | def get_world_size():
8 | if torch.distributed.is_initialized():
9 | return torch.distributed.get_world_size()
10 | return 1
11 |
12 | def all_gather(x):
13 | if get_world_size() > 1:
14 | all_x = [torch.zeros_like(x) for _ in range(get_world_size())]
15 | torch.distributed.all_gather(all_x, x.detach())
16 | all_x[torch.distributed.get_rank()] = x
17 | x = torch.stack(all_x, dim=0)
18 | return x
19 |
20 | def init_distributed_mode(args):
21 | if 'WORLD_SIZE' in os.environ and os.environ['WORLD_SIZE'] != '': # 'RANK' in os.environ and
22 | args.rank = int(os.environ["RANK"])
23 | args.world_size = int(os.environ['WORLD_SIZE'])
24 | args.gpu = args.local_rank = int(os.environ['LOCAL_RANK'])
25 |
26 | print('world size: {}, rank: {}, local rank: {}'.format(args.world_size, args.rank, args.local_rank))
27 | print(json.dumps(dict(os.environ), indent=2))
28 | elif 'SLURM_PROCID' in os.environ:
29 | args.rank = int(os.environ['SLURM_PROCID'])
30 | args.gpu = args.local_rank = int(os.environ['SLURM_LOCALID'])
31 | args.world_size = int(os.environ['SLURM_NPROCS'])
32 |
33 | if os.environ.get('HAND_DEFINE_DIST_URL', 0) == '1':
34 | pass
35 | else:
36 | import util.hostlist as uh
37 | nodenames = uh.parse_nodelist(os.environ['SLURM_JOB_NODELIST'])
38 | gpu_ids = [int(node[3:]) for node in nodenames]
39 | fixid = int(os.environ.get('FIX_DISTRIBUTED_PORT_NUMBER', 0))
40 | # fixid += random.randint(0, 300)
41 | port = str(3137 + int(min(gpu_ids)) + fixid)
42 | args.dist_url = "tcp://{ip}:{port}".format(ip=uh.nodename_to_ip(nodenames[0]), port=port)
43 |
44 | print('world size: {}, world rank: {}, local rank: {}, device_count: {}'.format(args.world_size, args.rank, args.local_rank, torch.cuda.device_count()))
45 |
46 |
47 | else:
48 | print('Not using distributed mode')
49 | args.distributed = False
50 | args.world_size = 1
51 | args.rank = 0
52 | args.local_rank = 0
53 | return
54 |
55 | print("world_size:{} rank:{} local_rank:{}".format(args.world_size, args.rank, args.local_rank))
56 | args.distributed = True
57 | torch.cuda.set_device(args.local_rank)
58 | args.dist_backend = 'nccl'
59 | print('| distributed init (rank {}): {}'.format(args.rank, args.dist_url), flush=True)
60 |
61 | torch.distributed.init_process_group(
62 | backend=args.dist_backend,
63 | world_size=args.world_size,
64 | rank=args.rank,
65 | init_method=args.dist_url,
66 | )
67 |
68 | print("Before torch.distributed.barrier()")
69 | torch.distributed.barrier()
70 | print("End torch.distributed.barrier()")
--------------------------------------------------------------------------------
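A sketch of driving init_distributed_mode under torchrun, which exports RANK / WORLD_SIZE / LOCAL_RANK; note that this branch reads args.dist_url but does not set it, so the caller must provide one (env:// is a common choice, assumed here):

    import argparse

    args = argparse.Namespace(dist_url='env://')
    init_distributed_mode(args)          # fills in rank / world_size / local_rank / distributed
    if args.distributed:
        print(f'rank {args.rank}/{args.world_size}, local rank {args.local_rank}')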
/utils/misc.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
3 | # Copyright (c) 2022 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Xueyan Zou (xueyan@cs.wisc.edu)
6 | # --------------------------------------------------------
7 | import math
8 | import wandb
9 | import os
10 |
11 |
12 | # HACK for evaluation
13 | def hook_metadata(metadata, name):
14 | if name == 'cityscapes_fine_sem_seg_val':
15 | metadata.__setattr__("keep_sem_bgd", False)
16 | return metadata
17 |
18 | def hook_opt(model, name):
19 | if name in ['cityscapes_fine_panoptic_val', 'ade20k_panoptic_val', 'bdd10k_40_panoptic_val', 'cityscapes_fine_panoptic_val', 'scannet_21_panoptic_val']:
20 | model.model.object_mask_threshold = 0.4
21 | else:
22 | model.model.object_mask_threshold = 0.8
23 |
24 | # HACK for evaluation
25 | def hook_switcher(model, name):
26 | mappings = {}
27 | if name in ['cityscapes_fine_sem_seg_val', 'scannet_21_val_seg', 'scannet_38_val_seg', 'scannet_41_val_seg', 'sunrgbd_37_val_seg', 'bdd10k_val_sem_seg', 'ade20k_full_sem_seg_val']:
28 | mappings = {'SEMANTIC_ON': True, 'INSTANCE_ON': False, 'PANOPTIC_ON': False}
29 | elif name in ['cityscapes_fine_instance_seg_val', 'pascal_part_val_interactive', 'pascal_part_val', 'pascal_part_train'] or 'seginw' in name or 'lvis' in name or 'odinw' in name:
30 | mappings = {'SEMANTIC_ON': False, 'INSTANCE_ON': True, 'PANOPTIC_ON': False}
31 | elif name in ['cityscapes_fine_panoptic_val', 'scannet_21_panoptic_val', 'bdd10k_40_panoptic_val']:
32 | # mappings = {'SEMANTIC_ON': True, 'INSTANCE_ON': False, 'PANOPTIC_ON': True}
33 | mappings = {'SEMANTIC_ON': True, 'INSTANCE_ON': True, 'PANOPTIC_ON': True}
34 | elif 'coco_2017_val_panoptic_with_sem_seg' in name or name in ['ade20k_panoptic_val', 'ade20k_panoptic_train', 'coco_2017_test-dev', 'sam_val', 'sam_minival']:
35 | mappings = {'SEMANTIC_ON': True, 'INSTANCE_ON': True, 'PANOPTIC_ON': True}
36 | elif name in ['cityscapes_fine_sem_seg_val', 'scannet_21_val_seg', 'scannet_38_val_seg', 'scannet_41_val_seg',
37 | 'sunrgbd_37_val_seg', 'context_59_val_seg', 'context_459_val_seg', 'voc_2012_val_seg',
38 | 'bdd10k_val_sem_seg']:
39 | mappings = {'SEMANTIC_ON': True, 'INSTANCE_ON': False, 'PANOPTIC_ON': False}
40 | elif name in ['cityscapes_fine_instance_seg_val'] or 'seginw' in name:
41 | mappings = {'SEMANTIC_ON': False, 'INSTANCE_ON': True, 'PANOPTIC_ON': False}
42 | mappings = {'SEMANTIC_ON': True, 'INSTANCE_ON': True, 'PANOPTIC_ON': True}
43 | elif name in ['coco_2017_val_panoptic_with_sem_seg', 'ade20k_panoptic_val']:
44 | mappings = {'SEMANTIC_ON': True, 'INSTANCE_ON': True, 'PANOPTIC_ON': True}
45 | else:
46 | if name not in ["vlp_val", "vlp_captioning_val", "vlp_val2017", "vlp_captioning_val2017", "imagenet_val", "refcocog_val_google", "phrasecut_val", "phrasecut_test", "refcocop_val_unc", "refcoco_val_unc", "refcocog_val_umd"]:
47 | assert False, "dataset switcher is not defined"
48 | for key, value in mappings.items():
49 | if key == 'SEMANTIC_ON':
50 | model.model.semantic_on = value
51 | if key == 'INSTANCE_ON':
52 | model.model.instance_on = value
53 | if key == 'PANOPTIC_ON':
54 | model.model.panoptic_on = value
55 |
56 | class AverageMeter(object):
57 | """Computes and stores the average and current value."""
58 | def __init__(self):
59 | self.reset()
60 |
61 | def reset(self):
62 | self.val = 0
63 | self.avg = 0
64 | self.sum = 0
65 | self.count = 0
66 |
67 | def update(self, val, n=1, decay=0):
68 | self.val = val
69 | if decay:
70 | alpha = math.exp(-n / decay) # exponential decay over `decay` updates
71 | self.sum = alpha * self.sum + (1 - alpha) * val * n
72 | self.count = alpha * self.count + (1 - alpha) * n
73 | else:
74 | self.sum += val * n
75 | self.count += n
76 | self.avg = self.sum / self.count
77 |
78 | def init_wandb(args, job_dir, entity='646396839lifeng', project='xdecoder', job_name='tmp'):
79 | wandb_dir = os.path.join(job_dir, 'wandb')
80 | os.makedirs(wandb_dir, exist_ok=True)
81 | runid = None
82 | if os.path.exists(f"{wandb_dir}/runid.txt"):
83 | runid = open(f"{wandb_dir}/runid.txt").read()
84 |
85 | wandb.init(project=project,
86 | name=job_name,
87 | dir=wandb_dir,
88 | entity=entity,
89 | resume="allow",
90 | id=runid,
91 | config={"hierarchical": True}, )
92 |
93 | open(f"{wandb_dir}/runid.txt", 'w').write(wandb.run.id)
94 | wandb.config.update({k: args[k] for k in args if k not in wandb.config})
--------------------------------------------------------------------------------
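A small sketch of the AverageMeter bookkeeping (without decay): update() accumulates a running sum and count, so avg is the mean of everything seen so far:

    meter = AverageMeter()
    for loss in [0.9, 0.7, 0.5]:
        meter.update(loss, n=1)
    print(meter.val, meter.avg)          # 0.5 (last value), 0.7 (running average)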
/utils/model.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import time
4 | import pickle
5 | import torch
6 | from detectron2.utils.comm import is_main_process
7 |
8 | logger = logging.getLogger(__name__)
9 |
10 |
11 | NORM_MODULES = [
12 | torch.nn.BatchNorm1d,
13 | torch.nn.BatchNorm2d,
14 | torch.nn.BatchNorm3d,
15 | torch.nn.SyncBatchNorm,
16 | # NaiveSyncBatchNorm inherits from BatchNorm2d
17 | torch.nn.GroupNorm,
18 | torch.nn.InstanceNorm1d,
19 | torch.nn.InstanceNorm2d,
20 | torch.nn.InstanceNorm3d,
21 | torch.nn.LayerNorm,
22 | torch.nn.LocalResponseNorm,
23 | ]
24 |
25 | def register_norm_module(cls):
26 | NORM_MODULES.append(cls)
27 | return cls
28 |
29 | def align_and_update_state_dicts(model_state_dict, ckpt_state_dict):
30 | model_keys = sorted(model_state_dict.keys())
31 | ckpt_keys = sorted(ckpt_state_dict.keys())
32 | result_dicts = {}
33 | matched_log = []
34 | unmatched_log = []
35 | unloaded_log = []
36 | for model_key in model_keys:
37 | model_weight = model_state_dict[model_key]
38 | if model_key in ckpt_keys:
39 | ckpt_weight = ckpt_state_dict[model_key]
40 | if model_weight.shape == ckpt_weight.shape:
41 | result_dicts[model_key] = ckpt_weight
42 | ckpt_keys.pop(ckpt_keys.index(model_key))
43 | matched_log.append("Loaded {}, Model Shape: {} <-> Ckpt Shape: {}".format(model_key, model_weight.shape, ckpt_weight.shape))
44 | else:
45 | unmatched_log.append("*UNMATCHED* {}, Model Shape: {} <-> Ckpt Shape: {}".format(model_key, model_weight.shape, ckpt_weight.shape))
46 | else:
47 | unloaded_log.append("*UNLOADED* {}, Model Shape: {}".format(model_key, model_weight.shape))
48 |
49 | if is_main_process():
50 | for info in matched_log:
51 | logger.info(info)
52 | for info in unloaded_log:
53 | logger.warning(info)
54 | for key in ckpt_keys:
55 | logger.warning("$UNUSED$ {}, Ckpt Shape: {}".format(key, ckpt_state_dict[key].shape))
56 | for info in unmatched_log:
57 | logger.warning(info)
58 | return result_dicts
--------------------------------------------------------------------------------
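A sketch of how align_and_update_state_dicts is typically used when a checkpoint only partially matches the model; the path is a placeholder and model is assumed to be an already-built nn.Module:

    import torch

    state = torch.load('weights/checkpoint.pt', map_location='cpu')
    state = state.get('model', state)                      # unwrap if the weights are nested
    aligned = align_and_update_state_dicts(model.state_dict(), state)
    model.load_state_dict(aligned, strict=False)           # only shape-matched keys are loaded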
/utils/sam_utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 |
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
--------------------------------------------------------------------------------
/utils/sam_utils/onnx.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 |
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | import torch
8 | import torch.nn as nn
9 | from torch.nn import functional as F
10 |
11 | from typing import Tuple
12 |
13 | from ..modeling import Sam
14 | from .amg import calculate_stability_score
15 |
16 |
17 | class SamOnnxModel(nn.Module):
18 | """
19 | This model should not be called directly, but is used in ONNX export.
20 | It combines the prompt encoder, mask decoder, and mask postprocessing of Sam,
21 | with some functions modified to enable model tracing. Also supports extra
22 | options controlling what information is returned. See the ONNX export script for details.
23 | """
24 |
25 | def __init__(
26 | self,
27 | model: Sam,
28 | return_single_mask: bool,
29 | use_stability_score: bool = False,
30 | return_extra_metrics: bool = False,
31 | ) -> None:
32 | super().__init__()
33 | self.mask_decoder = model.mask_decoder
34 | self.model = model
35 | self.img_size = model.image_encoder.img_size
36 | self.return_single_mask = return_single_mask
37 | self.use_stability_score = use_stability_score
38 | self.stability_score_offset = 1.0
39 | self.return_extra_metrics = return_extra_metrics
40 |
41 | @staticmethod
42 | def resize_longest_image_size(
43 | input_image_size: torch.Tensor, longest_side: int
44 | ) -> torch.Tensor:
45 | input_image_size = input_image_size.to(torch.float32)
46 | scale = longest_side / torch.max(input_image_size)
47 | transformed_size = scale * input_image_size
48 | transformed_size = torch.floor(transformed_size + 0.5).to(torch.int64)
49 | return transformed_size
50 |
51 | def _embed_points(self, point_coords: torch.Tensor, point_labels: torch.Tensor) -> torch.Tensor:
52 | point_coords = point_coords + 0.5
53 | point_coords = point_coords / self.img_size
54 | point_embedding = self.model.prompt_encoder.pe_layer._pe_encoding(point_coords)
55 | point_labels = point_labels.unsqueeze(-1).expand_as(point_embedding)
56 |
57 | point_embedding = point_embedding * (point_labels != -1)
58 | point_embedding = point_embedding + self.model.prompt_encoder.not_a_point_embed.weight * (
59 | point_labels == -1
60 | )
61 |
62 | for i in range(self.model.prompt_encoder.num_point_embeddings):
63 | point_embedding = point_embedding + self.model.prompt_encoder.point_embeddings[
64 | i
65 | ].weight * (point_labels == i)
66 |
67 | return point_embedding
68 |
69 | def _embed_masks(self, input_mask: torch.Tensor, has_mask_input: torch.Tensor) -> torch.Tensor:
70 | mask_embedding = has_mask_input * self.model.prompt_encoder.mask_downscaling(input_mask)
71 | mask_embedding = mask_embedding + (
72 | 1 - has_mask_input
73 | ) * self.model.prompt_encoder.no_mask_embed.weight.reshape(1, -1, 1, 1)
74 | return mask_embedding
75 |
76 | def mask_postprocessing(self, masks: torch.Tensor, orig_im_size: torch.Tensor) -> torch.Tensor:
77 | masks = F.interpolate(
78 | masks,
79 | size=(self.img_size, self.img_size),
80 | mode="bilinear",
81 | align_corners=False,
82 | )
83 |
84 | prepadded_size = self.resize_longest_image_size(orig_im_size, self.img_size)
85 | masks = masks[..., : int(prepadded_size[0]), : int(prepadded_size[1])]
86 |
87 | orig_im_size = orig_im_size.to(torch.int64)
88 | h, w = orig_im_size[0], orig_im_size[1]
89 | masks = F.interpolate(masks, size=(h, w), mode="bilinear", align_corners=False)
90 | return masks
91 |
92 | def select_masks(
93 | self, masks: torch.Tensor, iou_preds: torch.Tensor, num_points: int
94 | ) -> Tuple[torch.Tensor, torch.Tensor]:
95 | # Determine if we should return the multiclick mask or not from the number of points.
96 | # The reweighting is used to avoid control flow.
97 | score_reweight = torch.tensor(
98 | [[1000] + [0] * (self.model.mask_decoder.num_mask_tokens - 1)]
99 | ).to(iou_preds.device)
100 | score = iou_preds + (num_points - 2.5) * score_reweight
101 | best_idx = torch.argmax(score, dim=1)
102 | masks = masks[torch.arange(masks.shape[0]), best_idx, :, :].unsqueeze(1)
103 | iou_preds = iou_preds[torch.arange(masks.shape[0]), best_idx].unsqueeze(1)
104 |
105 | return masks, iou_preds
106 |
107 | @torch.no_grad()
108 | def forward(
109 | self,
110 | image_embeddings: torch.Tensor,
111 | point_coords: torch.Tensor,
112 | point_labels: torch.Tensor,
113 | mask_input: torch.Tensor,
114 | has_mask_input: torch.Tensor,
115 | orig_im_size: torch.Tensor,
116 | ):
117 | sparse_embedding = self._embed_points(point_coords, point_labels)
118 | dense_embedding = self._embed_masks(mask_input, has_mask_input)
119 |
120 | masks, scores = self.model.mask_decoder.predict_masks(
121 | image_embeddings=image_embeddings,
122 | image_pe=self.model.prompt_encoder.get_dense_pe(),
123 | sparse_prompt_embeddings=sparse_embedding,
124 | dense_prompt_embeddings=dense_embedding,
125 | )
126 |
127 | if self.use_stability_score:
128 | scores = calculate_stability_score(
129 | masks, self.model.mask_threshold, self.stability_score_offset
130 | )
131 |
132 | if self.return_single_mask:
133 | masks, scores = self.select_masks(masks, scores, point_coords.shape[1])
134 |
135 | upscaled_masks = self.mask_postprocessing(masks, orig_im_size)
136 |
137 | if self.return_extra_metrics:
138 | stability_scores = calculate_stability_score(
139 | upscaled_masks, self.model.mask_threshold, self.stability_score_offset
140 | )
141 | areas = (upscaled_masks > self.model.mask_threshold).sum(-1).sum(-1)
142 | return upscaled_masks, scores, stability_scores, areas, masks
143 |
144 | return upscaled_masks, scores, masks
145 |
--------------------------------------------------------------------------------
/utils/sam_utils/transforms.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 |
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | import numpy as np
8 | import torch
9 | from torch.nn import functional as F
10 | from torchvision.transforms.functional import resize, to_pil_image # type: ignore
11 |
12 | from copy import deepcopy
13 | from typing import Tuple
14 |
15 |
16 | class ResizeLongestSide:
17 | """
18 | Resizes images to longest side 'target_length', as well as provides
19 | methods for resizing coordinates and boxes. Provides methods for
20 | transforming both numpy array and batched torch tensors.
21 | """
22 |
23 | def __init__(self, target_length: int) -> None:
24 | self.target_length = target_length
25 |
26 | def apply_image(self, image: np.ndarray) -> np.ndarray:
27 | """
28 | Expects a numpy array with shape HxWxC in uint8 format.
29 | """
30 | target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length)
31 | return np.array(resize(to_pil_image(image), target_size))
32 |
33 | def apply_coords(self, coords: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray:
34 | """
35 | Expects a numpy array of length 2 in the final dimension. Requires the
36 | original image size in (H, W) format.
37 | """
38 | old_h, old_w = original_size
39 | new_h, new_w = self.get_preprocess_shape(
40 | original_size[0], original_size[1], self.target_length
41 | )
42 | coords = deepcopy(coords).astype(float)
43 | coords[..., 0] = coords[..., 0] * (new_w / old_w)
44 | coords[..., 1] = coords[..., 1] * (new_h / old_h)
45 | return coords
46 |
47 | def apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray:
48 | """
49 | Expects a numpy array shape Bx4. Requires the original image size
50 | in (H, W) format.
51 | """
52 | boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size)
53 | return boxes.reshape(-1, 4)
54 |
55 | def apply_image_torch(self, image: torch.Tensor) -> torch.Tensor:
56 | """
57 | Expects batched images with shape BxCxHxW and float format. This
58 | transformation may not exactly match apply_image. apply_image is
59 | the transformation expected by the model.
60 | """
61 | # Expects an image in BCHW format. May not exactly match apply_image.
62 | target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length)
63 | return F.interpolate(
64 | image, target_size, mode="bilinear", align_corners=False, antialias=True
65 | )
66 |
67 | def apply_coords_torch(
68 | self, coords: torch.Tensor, original_size: Tuple[int, ...]
69 | ) -> torch.Tensor:
70 | """
71 | Expects a torch tensor with length 2 in the last dimension. Requires the
72 | original image size in (H, W) format.
73 | """
74 | old_h, old_w = original_size
75 | new_h, new_w = self.get_preprocess_shape(
76 | original_size[0], original_size[1], self.target_length
77 | )
78 | coords = deepcopy(coords).to(torch.float)
79 | coords[..., 0] = coords[..., 0] * (new_w / old_w)
80 | coords[..., 1] = coords[..., 1] * (new_h / old_h)
81 | return coords
82 |
83 | def apply_boxes_torch(
84 | self, boxes: torch.Tensor, original_size: Tuple[int, ...]
85 | ) -> torch.Tensor:
86 | """
87 | Expects a torch tensor with shape Bx4. Requires the original image
88 | size in (H, W) format.
89 | """
90 | boxes = self.apply_coords_torch(boxes.reshape(-1, 2, 2), original_size)
91 | return boxes.reshape(-1, 4)
92 |
93 | @staticmethod
94 | def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int) -> Tuple[int, int]:
95 | """
96 | Compute the output size given input size and target long side length.
97 | """
98 | scale = long_side_length * 1.0 / max(oldh, oldw)
99 | newh, neww = oldh * scale, oldw * scale
100 | neww = int(neww + 0.5)
101 | newh = int(newh + 0.5)
102 | return (newh, neww)
103 |
--------------------------------------------------------------------------------
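A short sketch of ResizeLongestSide: a 480x640 image is resized so its longer side becomes 1024 (here 768x1024), and prompt coordinates are rescaled by the same factors:

    import numpy as np

    transform = ResizeLongestSide(1024)
    image = np.zeros((480, 640, 3), dtype=np.uint8)        # H x W x C, uint8
    resized = transform.apply_image(image)                 # -> shape (768, 1024, 3)
    assert max(resized.shape[:2]) == 1024

    coords = np.array([[320.0, 240.0]])                    # (x, y) in the original image
    coords_resized = transform.apply_coords(coords, image.shape[:2])   # scaled by 1.6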