├── .gitignore ├── LICENSE ├── README.md ├── assets ├── logo.png ├── lvis.png ├── overview.png ├── pl.png └── table1.png ├── detectron2 ├── __init__.py ├── checkpoint │ ├── __init__.py │ ├── c2_model_loading.py │ ├── catalog.py │ └── detection_checkpoint.py ├── config │ ├── __init__.py │ ├── compat.py │ ├── config.py │ ├── defaults.py │ ├── instantiate.py │ └── lazy.py ├── data │ ├── __init__.py │ ├── benchmark.py │ ├── build.py │ ├── catalog.py │ ├── common.py │ ├── dataset_mapper.py │ ├── datasets │ │ ├── README.md │ │ ├── __init__.py │ │ ├── builtin.py │ │ ├── builtin_meta.py │ │ ├── cityscapes.py │ │ ├── cityscapes_panoptic.py │ │ ├── coco.py │ │ ├── coco_panoptic.py │ │ ├── lvis.py │ │ ├── lvis_v0_5_categories.py │ │ ├── lvis_v1_categories.py │ │ ├── pascal_voc.py │ │ └── register_coco.py │ ├── detection_utils.py │ ├── samplers │ │ ├── __init__.py │ │ ├── distributed_sampler.py │ │ └── grouped_batch_sampler.py │ └── transforms │ │ ├── __init__.py │ │ ├── augmentation.py │ │ ├── augmentation_impl.py │ │ └── transform.py ├── engine │ ├── __init__.py │ ├── defaults.py │ ├── hooks.py │ ├── launch.py │ └── train_loop.py ├── evaluation │ ├── __init__.py │ ├── cityscapes_evaluation.py │ ├── coco_evaluation.py │ ├── evaluator.py │ ├── fast_eval_api.py │ ├── lvis_evaluation.py │ ├── panoptic_evaluation.py │ ├── pascal_voc_evaluation.py │ ├── refcocoeval.py │ ├── rotated_coco_evaluation.py │ ├── sem_seg_evaluation.py │ └── testing.py ├── export │ ├── README.md │ ├── __init__.py │ ├── api.py │ ├── c10.py │ ├── caffe2_export.py │ ├── caffe2_inference.py │ ├── caffe2_modeling.py │ ├── caffe2_patch.py │ ├── flatten.py │ ├── shared.py │ ├── torchscript.py │ └── torchscript_patch.py ├── layers │ ├── __init__.py │ ├── aspp.py │ ├── batch_norm.py │ ├── blocks.py │ ├── csrc │ │ ├── README.md │ │ ├── ROIAlignRotated │ │ │ ├── ROIAlignRotated.h │ │ │ ├── ROIAlignRotated_cpu.cpp │ │ │ └── ROIAlignRotated_cuda.cu │ │ ├── box_iou_rotated │ │ │ ├── box_iou_rotated.h │ │ │ ├── box_iou_rotated_cpu.cpp │ │ │ ├── box_iou_rotated_cuda.cu │ │ │ └── box_iou_rotated_utils.h │ │ ├── cocoeval │ │ │ ├── cocoeval.cpp │ │ │ └── cocoeval.h │ │ ├── cuda_version.cu │ │ ├── deformable │ │ │ ├── deform_conv.h │ │ │ ├── deform_conv_cuda.cu │ │ │ └── deform_conv_cuda_kernel.cu │ │ ├── nms_rotated │ │ │ ├── nms_rotated.h │ │ │ ├── nms_rotated_cpu.cpp │ │ │ └── nms_rotated_cuda.cu │ │ └── vision.cpp │ ├── deform_conv.py │ ├── losses.py │ ├── mask_ops.py │ ├── nms.py │ ├── roi_align.py │ ├── roi_align_rotated.py │ ├── rotated_boxes.py │ ├── shape_spec.py │ └── wrappers.py ├── model_zoo │ ├── __init__.py │ └── model_zoo.py ├── modeling │ ├── __init__.py │ ├── anchor_generator.py │ ├── backbone │ │ ├── __init__.py │ │ ├── backbone.py │ │ ├── build.py │ │ ├── fpn.py │ │ ├── regnet.py │ │ └── resnet.py │ ├── box_regression.py │ ├── matcher.py │ ├── meta_arch │ │ ├── __init__.py │ │ ├── build.py │ │ ├── dense_detector.py │ │ ├── fcos.py │ │ ├── panoptic_fpn.py │ │ ├── rcnn.py │ │ ├── retinanet.py │ │ └── semantic_seg.py │ ├── mmdet_wrapper.py │ ├── poolers.py │ ├── postprocessing.py │ ├── proposal_generator │ │ ├── __init__.py │ │ ├── build.py │ │ ├── proposal_utils.py │ │ ├── rpn.py │ │ └── rrpn.py │ ├── roi_heads │ │ ├── __init__.py │ │ ├── box_head.py │ │ ├── cascade_rcnn.py │ │ ├── fast_rcnn.py │ │ ├── keypoint_head.py │ │ ├── mask_head.py │ │ ├── roi_heads.py │ │ └── rotated_fast_rcnn.py │ ├── sampling.py │ └── test_time_augmentation.py ├── projects │ ├── README.md │ └── __init__.py ├── solver │ ├── __init__.py │ ├── build.py │ └── 
lr_scheduler.py ├── structures │ ├── __init__.py │ ├── boxes.py │ ├── image_list.py │ ├── instances.py │ ├── keypoints.py │ ├── masks.py │ └── rotated_boxes.py ├── tracking │ ├── __init__.py │ ├── base_tracker.py │ ├── bbox_iou_tracker.py │ ├── hungarian_tracker.py │ ├── iou_weighted_hungarian_bbox_iou_tracker.py │ ├── utils.py │ └── vanilla_hungarian_bbox_iou_tracker.py └── utils │ ├── README.md │ ├── __init__.py │ ├── analysis.py │ ├── collect_env.py │ ├── colormap.py │ ├── comm.py │ ├── develop.py │ ├── env.py │ ├── events.py │ ├── file_io.py │ ├── logger.py │ ├── memory.py │ ├── registry.py │ ├── serialize.py │ ├── testing.py │ ├── video_visualizer.py │ └── visualizer.py ├── launch.py ├── projects ├── .DS_Store └── DDETRS │ ├── configs │ ├── vg_grit5m_swinL.yaml │ ├── vg_grit5m_swinT.yaml │ ├── vg_swinL.yaml │ └── vg_swinT.yaml │ ├── ddetrs │ ├── __init__.py │ ├── backbone │ │ ├── __init__.py │ │ ├── masked_backbone.py │ │ ├── pos_embed.py │ │ ├── swin.py │ │ └── vit.py │ ├── config_uni.py │ ├── data │ │ ├── custom_build_augmentation.py │ │ ├── custom_dataset_dataloader.py │ │ ├── custom_dataset_mapper.py │ │ ├── datasets │ │ │ ├── grit20m.py │ │ │ ├── grit_pseudo.py │ │ │ ├── lvis_minival.py │ │ │ ├── parsers │ │ │ │ ├── __init__.py │ │ │ │ ├── coco_api.py │ │ │ │ └── coco_video_parser.py │ │ │ └── vg.py │ │ ├── mapper │ │ │ ├── __init__.py │ │ │ ├── augmentation.py │ │ │ ├── build.py │ │ │ ├── custom_dataset_dataloader.py │ │ │ └── dataset_mapper.py │ │ └── transforms │ │ │ ├── custom_augmentation_impl.py │ │ │ └── custom_transform.py │ ├── ddetrs_vl_uni.py │ ├── evaluation │ │ └── eval.py │ ├── models │ │ ├── conv_with_kaiming_uniform.py │ │ ├── deformable_detr │ │ │ ├── __init__.py │ │ │ ├── backbone.py │ │ │ ├── bert_model.py │ │ │ ├── deformable_detr.py │ │ │ ├── deformable_transformer.py │ │ │ ├── fuse_helper.py │ │ │ ├── matcher.py │ │ │ ├── modeling_bert.py │ │ │ ├── ops │ │ │ │ ├── functions │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── ms_deform_attn_func.py │ │ │ │ ├── make.sh │ │ │ │ ├── modules │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── ms_deform_attn.py │ │ │ │ ├── setup.py │ │ │ │ ├── src │ │ │ │ │ ├── cpu │ │ │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ │ │ └── ms_deform_attn_cpu.h │ │ │ │ │ ├── cuda │ │ │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ │ │ ├── ms_deform_attn.h │ │ │ │ │ └── vision.cpp │ │ │ │ └── test.py │ │ │ ├── position_encoding.py │ │ │ ├── segmentation.py │ │ │ └── vlfusion.py │ │ ├── segmentation_condInst_new_encodfpn.py │ │ └── text │ │ │ └── modeling_t5.py │ └── util │ │ ├── __init__.py │ │ ├── box_ops.py │ │ ├── misc.py │ │ ├── mmcv_utils.py │ │ └── plot_utils.py │ └── train_net.py ├── requirements.txt ├── setup.cfg ├── setup.py └── tools ├── convert-pretrained-swin-model-to-d2.py ├── evaluate_ap_fixed.py └── logger.py /.gitignore: -------------------------------------------------------------------------------- 1 | # output dir 2 | output 3 | instant_test_output 4 | inference_test_output 5 | wandb 6 | 7 | *.json 8 | *.diff 9 | *.jpg 10 | *.tar 11 | !/projects/DensePose/doc/images/*.jpg 12 | 13 | weights 14 | datasets 15 | !/projects/DDETRS/ddetrs/data/datasets 16 | 17 | # compilation and distribution 18 | __pycache__ 19 | _ext 20 | *.pyc 21 | *.pyd 22 | *.so 23 | *.dll 24 | *.egg-info/ 25 | build/ 26 | dist/ 27 | wheels/ 28 | 29 | # pytorch/python/numpy formats 30 | *.pth 31 | *.pkl 32 | *.npy 33 | *.ts 34 | *.pt 35 | model_ts*.txt 36 | 37 | # ipython/jupyter notebooks 38 | *.ipynb 
39 | **/.ipynb_checkpoints/ 40 | 41 | # Editor temporaries 42 | *.swn 43 | *.swo 44 | *.swp 45 | *~ 46 | wandb 47 | 48 | # editor settings 49 | .idea 50 | .vscode 51 | _darcs 52 | 53 | # project dirs 54 | /detectron2/model_zoo/configs 55 | /datasets/* 56 | !/datasets/*.* 57 | /models 58 | /snippet 59 | 60 | # .txt 61 | # *.txt* 62 | 63 | *.zip 64 | *.npz 65 | 66 | events.* 67 | 68 | *.bin 69 | 70 | # ReferFormer 71 | external/ReferFormer/data/ 72 | 73 | last_checkpoint 74 | 75 | OVIS 76 | 77 | external/bytetrack_unitrack_bdd 78 | 79 | *.csv 80 | 81 | config.yaml 82 | 83 | *.tar.gz 84 | *.mp4 85 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 FoundationVision 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FoundationVision/GenerateU/904e5337758d554e44ad50c2afa567c171166cd9/assets/logo.png -------------------------------------------------------------------------------- /assets/lvis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FoundationVision/GenerateU/904e5337758d554e44ad50c2afa567c171166cd9/assets/lvis.png -------------------------------------------------------------------------------- /assets/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FoundationVision/GenerateU/904e5337758d554e44ad50c2afa567c171166cd9/assets/overview.png -------------------------------------------------------------------------------- /assets/pl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FoundationVision/GenerateU/904e5337758d554e44ad50c2afa567c171166cd9/assets/pl.png -------------------------------------------------------------------------------- /assets/table1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FoundationVision/GenerateU/904e5337758d554e44ad50c2afa567c171166cd9/assets/table1.png -------------------------------------------------------------------------------- /detectron2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | from .utils.env import setup_environment 4 | 5 | setup_environment() 6 | 7 | 8 | # This line will be programatically read/write by setup.py. 9 | # Leave them at the bottom of this file and don't touch them. 10 | __version__ = "0.6" 11 | -------------------------------------------------------------------------------- /detectron2/checkpoint/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # File: 4 | 5 | 6 | from . import catalog as _UNUSED # register the handler 7 | from .detection_checkpoint import DetectionCheckpointer 8 | from fvcore.common.checkpoint import Checkpointer, PeriodicCheckpointer 9 | 10 | __all__ = ["Checkpointer", "PeriodicCheckpointer", "DetectionCheckpointer"] 11 | -------------------------------------------------------------------------------- /detectron2/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
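# A minimal, hedged usage sketch for the checkpoint API exported above
# (DetectionCheckpointer / PeriodicCheckpointer). The toy nn.Linear model and
# the "output" directory are illustrative placeholders, not part of this repo.

import torch
from detectron2.checkpoint import DetectionCheckpointer, PeriodicCheckpointer

model = torch.nn.Linear(4, 4)                          # stand-in for a real detection model
checkpointer = DetectionCheckpointer(model, save_dir="output")
checkpointer.save("model_final")                       # writes output/model_final.pth + last_checkpoint
checkpointer.resume_or_load("output/model_final.pth", resume=True)
periodic = PeriodicCheckpointer(checkpointer, period=5000)
# inside a training loop, periodic.step(iteration) saves every `period` iterations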
2 | from .compat import downgrade_config, upgrade_config 3 | from .config import CfgNode, get_cfg, global_cfg, set_global_cfg, configurable 4 | from .instantiate import instantiate 5 | from .lazy import LazyCall, LazyConfig 6 | 7 | __all__ = [ 8 | "CfgNode", 9 | "get_cfg", 10 | "global_cfg", 11 | "set_global_cfg", 12 | "downgrade_config", 13 | "upgrade_config", 14 | "configurable", 15 | "instantiate", 16 | "LazyCall", 17 | "LazyConfig", 18 | ] 19 | 20 | 21 | from detectron2.utils.env import fixup_module_metadata 22 | 23 | fixup_module_metadata(__name__, globals(), __all__) 24 | del fixup_module_metadata 25 | -------------------------------------------------------------------------------- /detectron2/config/instantiate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import dataclasses 3 | import logging 4 | from collections import abc 5 | from typing import Any 6 | 7 | from detectron2.utils.registry import _convert_target_to_string, locate 8 | 9 | __all__ = ["dump_dataclass", "instantiate"] 10 | 11 | 12 | def dump_dataclass(obj: Any): 13 | """ 14 | Dump a dataclass recursively into a dict that can be later instantiated. 15 | 16 | Args: 17 | obj: a dataclass object 18 | 19 | Returns: 20 | dict 21 | """ 22 | assert dataclasses.is_dataclass(obj) and not isinstance( 23 | obj, type 24 | ), "dump_dataclass() requires an instance of a dataclass." 25 | ret = {"_target_": _convert_target_to_string(type(obj))} 26 | for f in dataclasses.fields(obj): 27 | v = getattr(obj, f.name) 28 | if dataclasses.is_dataclass(v): 29 | v = dump_dataclass(v) 30 | if isinstance(v, (list, tuple)): 31 | v = [dump_dataclass(x) if dataclasses.is_dataclass(x) else x for x in v] 32 | ret[f.name] = v 33 | return ret 34 | 35 | 36 | def instantiate(cfg): 37 | """ 38 | Recursively instantiate objects defined in dictionaries by 39 | "_target_" and arguments. 40 | 41 | Args: 42 | cfg: a dict-like object with "_target_" that defines the caller, and 43 | other keys that define the arguments 44 | 45 | Returns: 46 | object instantiated by cfg 47 | """ 48 | from omegaconf import ListConfig 49 | 50 | if isinstance(cfg, ListConfig): 51 | lst = [instantiate(x) for x in cfg] 52 | return ListConfig(lst, flags={"allow_objects": True}) 53 | if isinstance(cfg, list): 54 | # Specialize for list, because many classes take 55 | # list[objects] as arguments, such as ResNet, DatasetMapper 56 | return [instantiate(x) for x in cfg] 57 | 58 | if isinstance(cfg, abc.Mapping) and "_target_" in cfg: 59 | # conceptually equivalent to hydra.utils.instantiate(cfg) with _convert_=all, 60 | # but faster: https://github.com/facebookresearch/hydra/issues/1200 61 | cfg = {k: instantiate(v) for k, v in cfg.items()} 62 | cls = cfg.pop("_target_") 63 | cls = instantiate(cls) 64 | 65 | if isinstance(cls, str): 66 | cls_name = cls 67 | cls = locate(cls_name) 68 | assert cls is not None, cls_name 69 | else: 70 | try: 71 | cls_name = cls.__module__ + "." 
+ cls.__qualname__ 72 | except Exception: 73 | # target could be anything, so the above could fail 74 | cls_name = str(cls) 75 | assert callable(cls), f"_target_ {cls} does not define a callable object" 76 | try: 77 | return cls(**cfg) 78 | except TypeError: 79 | logger = logging.getLogger(__name__) 80 | logger.error(f"Error when instantiating {cls_name}!") 81 | raise 82 | return cfg # return as-is if don't know what to do 83 | -------------------------------------------------------------------------------- /detectron2/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import transforms # isort:skip 3 | 4 | from .build import ( 5 | build_batch_data_loader, 6 | build_detection_test_loader, 7 | build_detection_train_loader, 8 | get_detection_dataset_dicts, 9 | load_proposals_into_dataset, 10 | print_instances_class_histogram, 11 | ) 12 | from .catalog import DatasetCatalog, MetadataCatalog, Metadata 13 | from .common import DatasetFromList, MapDataset, ToIterableDataset 14 | from .dataset_mapper import DatasetMapper 15 | 16 | # ensure the builtin datasets are registered 17 | from . import datasets, samplers # isort:skip 18 | 19 | __all__ = [k for k in globals().keys() if not k.startswith("_")] 20 | -------------------------------------------------------------------------------- /detectron2/data/datasets/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ### Common Datasets 4 | 5 | The dataset implemented here do not need to load the data into the final format. 6 | It should provide the minimal data structure needed to use the dataset, so it can be very efficient. 7 | 8 | For example, for an image dataset, just provide the file names and labels, but don't read the images. 9 | Let the downstream decide how to read. 10 | -------------------------------------------------------------------------------- /detectron2/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .coco import load_coco_json, load_sem_seg, register_coco_instances, convert_to_coco_json 3 | from .coco_panoptic import register_coco_panoptic, register_coco_panoptic_separated 4 | from .lvis import load_lvis_json, register_lvis_instances, get_lvis_instances_meta 5 | from .pascal_voc import load_voc_instances, register_pascal_voc 6 | from . import builtin as _builtin # ensure the builtin datasets are registered 7 | 8 | 9 | __all__ = [k for k in globals().keys() if not k.startswith("_")] 10 | -------------------------------------------------------------------------------- /detectron2/data/datasets/pascal_voc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
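# A hedged illustration of the "minimal data structure" that datasets/README.md
# describes and that loaders such as load_voc_instances (below) return: a list of
# lightweight dicts holding file names and annotations, with image reading
# deferred to downstream code. The concrete values here are made-up placeholders.

example_dataset_dicts = [
    {
        "file_name": "VOC2007/JPEGImages/000001.jpg",  # path only; the image is not read here
        "image_id": "000001",
        "height": 500,
        "width": 353,
        "annotations": [
            # bbox_mode 0 corresponds to BoxMode.XYXY_ABS
            {"category_id": 11, "bbox": [47.0, 239.0, 194.0, 370.0], "bbox_mode": 0},
        ],
    },
]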
3 | 4 | import numpy as np 5 | import os 6 | import xml.etree.ElementTree as ET 7 | from typing import List, Tuple, Union 8 | 9 | from detectron2.data import DatasetCatalog, MetadataCatalog 10 | from detectron2.structures import BoxMode 11 | from detectron2.utils.file_io import PathManager 12 | 13 | __all__ = ["load_voc_instances", "register_pascal_voc"] 14 | 15 | 16 | # fmt: off 17 | CLASS_NAMES = ( 18 | "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", 19 | "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", 20 | "pottedplant", "sheep", "sofa", "train", "tvmonitor" 21 | ) 22 | # fmt: on 23 | 24 | 25 | def load_voc_instances(dirname: str, split: str, class_names: Union[List[str], Tuple[str, ...]]): 26 | """ 27 | Load Pascal VOC detection annotations to Detectron2 format. 28 | 29 | Args: 30 | dirname: Contain "Annotations", "ImageSets", "JPEGImages" 31 | split (str): one of "train", "test", "val", "trainval" 32 | class_names: list or tuple of class names 33 | """ 34 | with PathManager.open(os.path.join(dirname, "ImageSets", "Main", split + ".txt")) as f: 35 | fileids = np.loadtxt(f, dtype=np.str) 36 | 37 | # Needs to read many small annotation files. Makes sense at local 38 | annotation_dirname = PathManager.get_local_path(os.path.join(dirname, "Annotations/")) 39 | dicts = [] 40 | for fileid in fileids: 41 | anno_file = os.path.join(annotation_dirname, fileid + ".xml") 42 | jpeg_file = os.path.join(dirname, "JPEGImages", fileid + ".jpg") 43 | 44 | with PathManager.open(anno_file) as f: 45 | tree = ET.parse(f) 46 | 47 | r = { 48 | "file_name": jpeg_file, 49 | "image_id": fileid, 50 | "height": int(tree.findall("./size/height")[0].text), 51 | "width": int(tree.findall("./size/width")[0].text), 52 | } 53 | instances = [] 54 | 55 | for obj in tree.findall("object"): 56 | cls = obj.find("name").text 57 | # We include "difficult" samples in training. 58 | # Based on limited experiments, they don't hurt accuracy. 59 | # difficult = int(obj.find("difficult").text) 60 | # if difficult == 1: 61 | # continue 62 | bbox = obj.find("bndbox") 63 | bbox = [float(bbox.find(x).text) for x in ["xmin", "ymin", "xmax", "ymax"]] 64 | # Original annotations are integers in the range [1, W or H] 65 | # Assuming they mean 1-based pixel indices (inclusive), 66 | # a box with annotation (xmin=1, xmax=W) covers the whole image. 67 | # In coordinate space this is represented by (xmin=0, xmax=W) 68 | bbox[0] -= 1.0 69 | bbox[1] -= 1.0 70 | instances.append( 71 | {"category_id": class_names.index(cls), "bbox": bbox, "bbox_mode": BoxMode.XYXY_ABS} 72 | ) 73 | r["annotations"] = instances 74 | dicts.append(r) 75 | return dicts 76 | 77 | 78 | def register_pascal_voc(name, dirname, split, year, class_names=CLASS_NAMES): 79 | DatasetCatalog.register(name, lambda: load_voc_instances(dirname, split, class_names)) 80 | MetadataCatalog.get(name).set( 81 | thing_classes=list(class_names), dirname=dirname, year=year, split=split 82 | ) 83 | -------------------------------------------------------------------------------- /detectron2/data/datasets/register_coco.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
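# A short, hedged usage sketch for the dataset-registration helpers collected in
# this module; the dataset name and paths below are placeholders rather than
# datasets shipped with this repository.

from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.data.datasets import register_coco_instances

register_coco_instances(
    "my_coco_train",                                   # name later referenced by cfg.DATASETS.TRAIN
    {},                                                # extra metadata (may be empty)
    "datasets/my_coco/annotations/train.json",         # COCO-format json
    "datasets/my_coco/images/train",                   # image root
)
dataset_dicts = DatasetCatalog.get("my_coco_train")    # the json is parsed on first access
thing_classes = MetadataCatalog.get("my_coco_train").thing_classes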
2 | from .coco import register_coco_instances # noqa 3 | from .coco_panoptic import register_coco_panoptic_separated # noqa 4 | -------------------------------------------------------------------------------- /detectron2/data/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .distributed_sampler import ( 3 | InferenceSampler, 4 | RandomSubsetTrainingSampler, 5 | RepeatFactorTrainingSampler, 6 | TrainingSampler, 7 | ) 8 | 9 | from .grouped_batch_sampler import GroupedBatchSampler 10 | 11 | __all__ = [ 12 | "GroupedBatchSampler", 13 | "TrainingSampler", 14 | "RandomSubsetTrainingSampler", 15 | "InferenceSampler", 16 | "RepeatFactorTrainingSampler", 17 | ] 18 | -------------------------------------------------------------------------------- /detectron2/data/samplers/grouped_batch_sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import numpy as np 3 | from torch.utils.data.sampler import BatchSampler, Sampler 4 | 5 | 6 | class GroupedBatchSampler(BatchSampler): 7 | """ 8 | Wraps another sampler to yield a mini-batch of indices. 9 | It enforces that the batch only contain elements from the same group. 10 | It also tries to provide mini-batches which follows an ordering which is 11 | as close as possible to the ordering from the original sampler. 12 | """ 13 | 14 | def __init__(self, sampler, group_ids, batch_size): 15 | """ 16 | Args: 17 | sampler (Sampler): Base sampler. 18 | group_ids (list[int]): If the sampler produces indices in range [0, N), 19 | `group_ids` must be a list of `N` ints which contains the group id of each sample. 20 | The group ids must be a set of integers in the range [0, num_groups). 21 | batch_size (int): Size of mini-batch. 22 | """ 23 | if not isinstance(sampler, Sampler): 24 | raise ValueError( 25 | "sampler should be an instance of " 26 | "torch.utils.data.Sampler, but got sampler={}".format(sampler) 27 | ) 28 | self.sampler = sampler 29 | self.group_ids = np.asarray(group_ids) 30 | assert self.group_ids.ndim == 1 31 | self.batch_size = batch_size 32 | groups = np.unique(self.group_ids).tolist() 33 | 34 | # buffer the indices of each group until batch size is reached 35 | self.buffer_per_group = {k: [] for k in groups} 36 | 37 | def __iter__(self): 38 | for idx in self.sampler: 39 | group_id = self.group_ids[idx] 40 | group_buffer = self.buffer_per_group[group_id] 41 | group_buffer.append(idx) 42 | if len(group_buffer) == self.batch_size: 43 | yield group_buffer[:] # yield a copy of the list 44 | del group_buffer[:] 45 | 46 | def __len__(self): 47 | raise NotImplementedError("len() of GroupedBatchSampler is not well-defined.") 48 | -------------------------------------------------------------------------------- /detectron2/data/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
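# A hedged, self-contained sketch of the GroupedBatchSampler defined in the
# samplers package above; the dummy data and the aspect-ratio grouping are
# illustrative assumptions.

from torch.utils.data.sampler import SequentialSampler
from detectron2.data.samplers import GroupedBatchSampler

data = list(range(8))                         # 8 dummy samples
group_ids = [0, 1, 0, 1, 0, 1, 0, 1]          # e.g. 0 = wide image, 1 = tall image
batch_sampler = GroupedBatchSampler(SequentialSampler(data), group_ids, batch_size=2)
for batch in batch_sampler:
    print(batch)                              # [0, 2], [1, 3], [4, 6], [5, 7] -- one group per batch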
2 | from fvcore.transforms.transform import Transform, TransformList # order them first 3 | from fvcore.transforms.transform import * 4 | from .transform import * 5 | from .augmentation import * 6 | from .augmentation_impl import * 7 | 8 | __all__ = [k for k in globals().keys() if not k.startswith("_")] 9 | 10 | 11 | from detectron2.utils.env import fixup_module_metadata 12 | 13 | fixup_module_metadata(__name__, globals(), __all__) 14 | del fixup_module_metadata 15 | -------------------------------------------------------------------------------- /detectron2/engine/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | from .launch import * 4 | from .train_loop import * 5 | 6 | __all__ = [k for k in globals().keys() if not k.startswith("_")] 7 | 8 | 9 | # prefer to let hooks and defaults live in separate namespaces (therefore not in __all__) 10 | # but still make them available here 11 | from .hooks import * 12 | from .defaults import * 13 | -------------------------------------------------------------------------------- /detectron2/engine/launch.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import logging 3 | from datetime import timedelta 4 | import torch 5 | import torch.distributed as dist 6 | import torch.multiprocessing as mp 7 | 8 | from detectron2.utils import comm 9 | 10 | __all__ = ["DEFAULT_TIMEOUT", "launch"] 11 | 12 | DEFAULT_TIMEOUT = timedelta(minutes=30) 13 | 14 | 15 | def _find_free_port(): 16 | import socket 17 | 18 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 19 | # Binding to port 0 will cause the OS to find an available port for us 20 | sock.bind(("", 0)) 21 | port = sock.getsockname()[1] 22 | sock.close() 23 | # NOTE: there is still a chance the port could be taken by other processes. 24 | return port 25 | 26 | 27 | def launch( 28 | main_func, 29 | num_gpus_per_machine, 30 | num_machines=1, 31 | machine_rank=0, 32 | dist_url=None, 33 | args=(), 34 | timeout=DEFAULT_TIMEOUT, 35 | ): 36 | """ 37 | Launch multi-gpu or distributed training. 38 | This function must be called on all machines involved in the training. 39 | It will spawn child processes (defined by ``num_gpus_per_machine``) on each machine. 40 | 41 | Args: 42 | main_func: a function that will be called by `main_func(*args)` 43 | num_gpus_per_machine (int): number of GPUs per machine 44 | num_machines (int): the total number of machines 45 | machine_rank (int): the rank of this machine 46 | dist_url (str): url to connect to for distributed jobs, including protocol 47 | e.g. "tcp://127.0.0.1:8686". 48 | Can be set to "auto" to automatically select a free port on localhost 49 | timeout (timedelta): timeout of the distributed workers 50 | args (tuple): arguments passed to main_func 51 | """ 52 | world_size = num_machines * num_gpus_per_machine 53 | if world_size > 1: 54 | # https://github.com/pytorch/pytorch/pull/14391 55 | # TODO prctl in spawned processes 56 | 57 | if dist_url == "auto": 58 | assert num_machines == 1, "dist_url=auto not supported in multi-machine jobs." 59 | port = _find_free_port() 60 | dist_url = f"tcp://127.0.0.1:{port}" 61 | if num_machines > 1 and dist_url.startswith("file://"): 62 | logger = logging.getLogger(__name__) 63 | logger.warning( 64 | "file:// is not a reliable init_method in multi-machine jobs. 
Prefer tcp://" 65 | ) 66 | 67 | mp.spawn( 68 | _distributed_worker, 69 | nprocs=num_gpus_per_machine, 70 | args=( 71 | main_func, 72 | world_size, 73 | num_gpus_per_machine, 74 | machine_rank, 75 | dist_url, 76 | args, 77 | timeout, 78 | ), 79 | daemon=False, 80 | ) 81 | else: 82 | main_func(*args) 83 | 84 | 85 | def _distributed_worker( 86 | local_rank, 87 | main_func, 88 | world_size, 89 | num_gpus_per_machine, 90 | machine_rank, 91 | dist_url, 92 | args, 93 | timeout=DEFAULT_TIMEOUT, 94 | ): 95 | assert torch.cuda.is_available(), "cuda is not available. Please check your installation." 96 | global_rank = machine_rank * num_gpus_per_machine + local_rank 97 | try: 98 | dist.init_process_group( 99 | backend="NCCL", 100 | init_method=dist_url, 101 | world_size=world_size, 102 | rank=global_rank, 103 | timeout=timeout, 104 | ) 105 | except Exception as e: 106 | logger = logging.getLogger(__name__) 107 | logger.error("Process group URL: {}".format(dist_url)) 108 | raise e 109 | 110 | # Setup the local process group (which contains ranks within the same machine) 111 | assert comm._LOCAL_PROCESS_GROUP is None 112 | num_machines = world_size // num_gpus_per_machine 113 | for i in range(num_machines): 114 | ranks_on_i = list(range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine)) 115 | pg = dist.new_group(ranks_on_i) 116 | if i == machine_rank: 117 | comm._LOCAL_PROCESS_GROUP = pg 118 | 119 | assert num_gpus_per_machine <= torch.cuda.device_count() 120 | torch.cuda.set_device(local_rank) 121 | 122 | # synchronize is needed here to prevent a possible timeout after calling init_process_group 123 | # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172 124 | comm.synchronize() 125 | 126 | main_func(*args) 127 | -------------------------------------------------------------------------------- /detectron2/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .cityscapes_evaluation import CityscapesInstanceEvaluator, CityscapesSemSegEvaluator 3 | from .coco_evaluation import COCOEvaluator 4 | from .rotated_coco_evaluation import RotatedCOCOEvaluator 5 | from .evaluator import DatasetEvaluator, DatasetEvaluators, inference_context, inference_on_dataset 6 | from .lvis_evaluation import LVISEvaluator 7 | from .panoptic_evaluation import COCOPanopticEvaluator 8 | from .pascal_voc_evaluation import PascalVOCDetectionEvaluator 9 | from .sem_seg_evaluation import SemSegEvaluator 10 | from .testing import print_csv_format, verify_results 11 | 12 | __all__ = [k for k in globals().keys() if not k.startswith("_")] 13 | -------------------------------------------------------------------------------- /detectron2/evaluation/testing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import logging 3 | import numpy as np 4 | import pprint 5 | import sys 6 | from collections.abc import Mapping 7 | 8 | 9 | def print_csv_format(results): 10 | """ 11 | Print main metrics in a format similar to Detectron, 12 | so that they are easy to copypaste into a spreadsheet. 
13 | 14 | Args: 15 | results (OrderedDict[dict]): task_name -> {metric -> score} 16 | unordered dict can also be printed, but in arbitrary order 17 | """ 18 | assert isinstance(results, Mapping) or not len(results), results 19 | logger = logging.getLogger(__name__) 20 | for task, res in results.items(): 21 | if isinstance(res, Mapping): 22 | # Don't print "AP-category" metrics since they are usually not tracked. 23 | important_res = [(k, v) for k, v in res.items() if "-" not in k] 24 | logger.info("copypaste: Task: {}".format(task)) 25 | logger.info("copypaste: " + ",".join([k[0] for k in important_res])) 26 | logger.info("copypaste: " + ",".join(["{0:.4f}".format(k[1]) for k in important_res])) 27 | else: 28 | logger.info(f"copypaste: {task}={res}") 29 | 30 | 31 | def verify_results(cfg, results): 32 | """ 33 | Args: 34 | results (OrderedDict[dict]): task_name -> {metric -> score} 35 | 36 | Returns: 37 | bool: whether the verification succeeds or not 38 | """ 39 | expected_results = cfg.TEST.EXPECTED_RESULTS 40 | if not len(expected_results): 41 | return True 42 | 43 | ok = True 44 | for task, metric, expected, tolerance in expected_results: 45 | actual = results[task].get(metric, None) 46 | if actual is None: 47 | ok = False 48 | continue 49 | if not np.isfinite(actual): 50 | ok = False 51 | continue 52 | diff = abs(actual - expected) 53 | if diff > tolerance: 54 | ok = False 55 | 56 | logger = logging.getLogger(__name__) 57 | if not ok: 58 | logger.error("Result verification failed!") 59 | logger.error("Expected Results: " + str(expected_results)) 60 | logger.error("Actual Results: " + pprint.pformat(results)) 61 | 62 | sys.exit(1) 63 | else: 64 | logger.info("Results verification passed.") 65 | return ok 66 | 67 | 68 | def flatten_results_dict(results): 69 | """ 70 | Expand a hierarchical dict of scalars into a flat dict of scalars. 71 | If results[k1][k2][k3] = v, the returned dict will have the entry 72 | {"k1/k2/k3": v}. 73 | 74 | Args: 75 | results (dict): 76 | """ 77 | r = {} 78 | for k, v in results.items(): 79 | if isinstance(v, Mapping): 80 | v = flatten_results_dict(v) 81 | for kk, vv in v.items(): 82 | r[k + "/" + kk] = vv 83 | else: 84 | r[k] = v 85 | return r 86 | -------------------------------------------------------------------------------- /detectron2/export/README.md: -------------------------------------------------------------------------------- 1 | 2 | This directory contains code to prepare a detectron2 model for deployment. 3 | Currently it supports exporting a detectron2 model to Caffe2 format through ONNX. 4 | 5 | Please see [documentation](https://detectron2.readthedocs.io/tutorials/deployment.html) for its usage. 6 | 7 | 8 | ### Acknowledgements 9 | 10 | Thanks to Mobile Vision team at Facebook for developing the Caffe2 conversion tools. 11 | 12 | Thanks to Computing Platform Department - PAI team at Alibaba Group (@bddpqq, @chenbohua3) who 13 | help export Detectron2 models to TorchScript. 
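For TorchScript, a minimal tracing sketch looks roughly like the following
(here `model` is assumed to be an already-built detectron2 model set to eval
mode, and the example input is a placeholder):

    import torch
    from detectron2.export import TracingAdapter

    inputs = [{"image": torch.rand(3, 480, 640)}]       # the usual list-of-dicts batch
    adapter = TracingAdapter(model, inputs)              # flattens dict/Instances I/O into tensors
    traced = torch.jit.trace(adapter, adapter.flattened_inputs)
    traced.save("model_traced.ts")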
14 | -------------------------------------------------------------------------------- /detectron2/export/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | try: 4 | from caffe2.proto import caffe2_pb2 as _tmp 5 | 6 | # caffe2 is optional 7 | except ImportError: 8 | pass 9 | else: 10 | from .api import * 11 | 12 | from .flatten import TracingAdapter 13 | from .torchscript import scripting_with_instances, dump_torchscript_IR 14 | 15 | __all__ = [k for k in globals().keys() if not k.startswith("_")] 16 | -------------------------------------------------------------------------------- /detectron2/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .batch_norm import FrozenBatchNorm2d, get_norm, NaiveSyncBatchNorm, CycleBatchNormList 3 | from .deform_conv import DeformConv, ModulatedDeformConv 4 | from .mask_ops import paste_masks_in_image 5 | from .nms import batched_nms, batched_nms_rotated, nms, nms_rotated 6 | from .roi_align import ROIAlign, roi_align 7 | from .roi_align_rotated import ROIAlignRotated, roi_align_rotated 8 | from .shape_spec import ShapeSpec 9 | from .wrappers import ( 10 | BatchNorm2d, 11 | Conv2d, 12 | ConvTranspose2d, 13 | cat, 14 | interpolate, 15 | Linear, 16 | nonzero_tuple, 17 | cross_entropy, 18 | shapes_to_tensor, 19 | ) 20 | from .blocks import CNNBlockBase, DepthwiseSeparableConv2d 21 | from .aspp import ASPP 22 | from .losses import ciou_loss, diou_loss 23 | 24 | __all__ = [k for k in globals().keys() if not k.startswith("_")] 25 | -------------------------------------------------------------------------------- /detectron2/layers/blocks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | import fvcore.nn.weight_init as weight_init 5 | from torch import nn 6 | 7 | from .batch_norm import FrozenBatchNorm2d, get_norm 8 | from .wrappers import Conv2d 9 | 10 | 11 | """ 12 | CNN building blocks. 13 | """ 14 | 15 | 16 | class CNNBlockBase(nn.Module): 17 | """ 18 | A CNN block is assumed to have input channels, output channels and a stride. 19 | The input and output of `forward()` method must be NCHW tensors. 20 | The method can perform arbitrary computation but must match the given 21 | channels and stride specification. 22 | 23 | Attribute: 24 | in_channels (int): 25 | out_channels (int): 26 | stride (int): 27 | """ 28 | 29 | def __init__(self, in_channels, out_channels, stride): 30 | """ 31 | The `__init__` method of any subclass should also contain these arguments. 32 | 33 | Args: 34 | in_channels (int): 35 | out_channels (int): 36 | stride (int): 37 | """ 38 | super().__init__() 39 | self.in_channels = in_channels 40 | self.out_channels = out_channels 41 | self.stride = stride 42 | 43 | def freeze(self): 44 | """ 45 | Make this block not trainable. 46 | This method sets all parameters to `requires_grad=False`, 47 | and convert all BatchNorm layers to FrozenBatchNorm 48 | 49 | Returns: 50 | the block itself 51 | """ 52 | for p in self.parameters(): 53 | p.requires_grad = False 54 | FrozenBatchNorm2d.convert_frozen_batchnorm(self) 55 | return self 56 | 57 | 58 | class DepthwiseSeparableConv2d(nn.Module): 59 | """ 60 | A kxk depthwise convolution + a 1x1 convolution. 61 | 62 | In :paper:`xception`, norm & activation are applied on the second conv. 
63 | :paper:`mobilenet` uses norm & activation on both convs. 64 | """ 65 | 66 | def __init__( 67 | self, 68 | in_channels, 69 | out_channels, 70 | kernel_size=3, 71 | padding=1, 72 | dilation=1, 73 | *, 74 | norm1=None, 75 | activation1=None, 76 | norm2=None, 77 | activation2=None, 78 | ): 79 | """ 80 | Args: 81 | norm1, norm2 (str or callable): normalization for the two conv layers. 82 | activation1, activation2 (callable(Tensor) -> Tensor): activation 83 | function for the two conv layers. 84 | """ 85 | super().__init__() 86 | self.depthwise = Conv2d( 87 | in_channels, 88 | in_channels, 89 | kernel_size=kernel_size, 90 | padding=padding, 91 | dilation=dilation, 92 | groups=in_channels, 93 | bias=not norm1, 94 | norm=get_norm(norm1, in_channels), 95 | activation=activation1, 96 | ) 97 | self.pointwise = Conv2d( 98 | in_channels, 99 | out_channels, 100 | kernel_size=1, 101 | bias=not norm2, 102 | norm=get_norm(norm2, out_channels), 103 | activation=activation2, 104 | ) 105 | 106 | # default initialization 107 | weight_init.c2_msra_fill(self.depthwise) 108 | weight_init.c2_msra_fill(self.pointwise) 109 | 110 | def forward(self, x): 111 | return self.pointwise(self.depthwise(x)) 112 | -------------------------------------------------------------------------------- /detectron2/layers/csrc/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | To add a new Op: 4 | 5 | 1. Create a new directory 6 | 2. Implement new ops there 7 | 3. Delcare its Python interface in `vision.cpp`. 8 | -------------------------------------------------------------------------------- /detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. 
2 | #pragma once 3 | #include 4 | 5 | namespace detectron2 { 6 | 7 | at::Tensor ROIAlignRotated_forward_cpu( 8 | const at::Tensor& input, 9 | const at::Tensor& rois, 10 | const float spatial_scale, 11 | const int pooled_height, 12 | const int pooled_width, 13 | const int sampling_ratio); 14 | 15 | at::Tensor ROIAlignRotated_backward_cpu( 16 | const at::Tensor& grad, 17 | const at::Tensor& rois, 18 | const float spatial_scale, 19 | const int pooled_height, 20 | const int pooled_width, 21 | const int batch_size, 22 | const int channels, 23 | const int height, 24 | const int width, 25 | const int sampling_ratio); 26 | 27 | #if defined(WITH_CUDA) || defined(WITH_HIP) 28 | at::Tensor ROIAlignRotated_forward_cuda( 29 | const at::Tensor& input, 30 | const at::Tensor& rois, 31 | const float spatial_scale, 32 | const int pooled_height, 33 | const int pooled_width, 34 | const int sampling_ratio); 35 | 36 | at::Tensor ROIAlignRotated_backward_cuda( 37 | const at::Tensor& grad, 38 | const at::Tensor& rois, 39 | const float spatial_scale, 40 | const int pooled_height, 41 | const int pooled_width, 42 | const int batch_size, 43 | const int channels, 44 | const int height, 45 | const int width, 46 | const int sampling_ratio); 47 | #endif 48 | 49 | // Interface for Python 50 | inline at::Tensor ROIAlignRotated_forward( 51 | const at::Tensor& input, 52 | const at::Tensor& rois, 53 | const double spatial_scale, 54 | const int64_t pooled_height, 55 | const int64_t pooled_width, 56 | const int64_t sampling_ratio) { 57 | if (input.is_cuda()) { 58 | #if defined(WITH_CUDA) || defined(WITH_HIP) 59 | return ROIAlignRotated_forward_cuda( 60 | input, 61 | rois, 62 | spatial_scale, 63 | pooled_height, 64 | pooled_width, 65 | sampling_ratio); 66 | #else 67 | AT_ERROR("Detectron2 is not compiled with GPU support!"); 68 | #endif 69 | } 70 | return ROIAlignRotated_forward_cpu( 71 | input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); 72 | } 73 | 74 | inline at::Tensor ROIAlignRotated_backward( 75 | const at::Tensor& grad, 76 | const at::Tensor& rois, 77 | const double spatial_scale, 78 | const int64_t pooled_height, 79 | const int64_t pooled_width, 80 | const int64_t batch_size, 81 | const int64_t channels, 82 | const int64_t height, 83 | const int64_t width, 84 | const int64_t sampling_ratio) { 85 | if (grad.is_cuda()) { 86 | #if defined(WITH_CUDA) || defined(WITH_HIP) 87 | return ROIAlignRotated_backward_cuda( 88 | grad, 89 | rois, 90 | spatial_scale, 91 | pooled_height, 92 | pooled_width, 93 | batch_size, 94 | channels, 95 | height, 96 | width, 97 | sampling_ratio); 98 | #else 99 | AT_ERROR("Detectron2 is not compiled with GPU support!"); 100 | #endif 101 | } 102 | return ROIAlignRotated_backward_cpu( 103 | grad, 104 | rois, 105 | spatial_scale, 106 | pooled_height, 107 | pooled_width, 108 | batch_size, 109 | channels, 110 | height, 111 | width, 112 | sampling_ratio); 113 | } 114 | 115 | } // namespace detectron2 116 | -------------------------------------------------------------------------------- /detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. 
2 | #pragma once 3 | #include 4 | 5 | namespace detectron2 { 6 | 7 | at::Tensor box_iou_rotated_cpu( 8 | const at::Tensor& boxes1, 9 | const at::Tensor& boxes2); 10 | 11 | #if defined(WITH_CUDA) || defined(WITH_HIP) 12 | at::Tensor box_iou_rotated_cuda( 13 | const at::Tensor& boxes1, 14 | const at::Tensor& boxes2); 15 | #endif 16 | 17 | // Interface for Python 18 | // inline is needed to prevent multiple function definitions when this header is 19 | // included by different cpps 20 | inline at::Tensor box_iou_rotated( 21 | const at::Tensor& boxes1, 22 | const at::Tensor& boxes2) { 23 | assert(boxes1.device().is_cuda() == boxes2.device().is_cuda()); 24 | if (boxes1.device().is_cuda()) { 25 | #if defined(WITH_CUDA) || defined(WITH_HIP) 26 | return box_iou_rotated_cuda(boxes1.contiguous(), boxes2.contiguous()); 27 | #else 28 | AT_ERROR("Detectron2 is not compiled with GPU support!"); 29 | #endif 30 | } 31 | 32 | return box_iou_rotated_cpu(boxes1.contiguous(), boxes2.contiguous()); 33 | } 34 | 35 | } // namespace detectron2 36 | -------------------------------------------------------------------------------- /detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. 2 | #include "box_iou_rotated.h" 3 | #include "box_iou_rotated_utils.h" 4 | 5 | namespace detectron2 { 6 | 7 | template 8 | void box_iou_rotated_cpu_kernel( 9 | const at::Tensor& boxes1, 10 | const at::Tensor& boxes2, 11 | at::Tensor& ious) { 12 | auto num_boxes1 = boxes1.size(0); 13 | auto num_boxes2 = boxes2.size(0); 14 | 15 | for (int i = 0; i < num_boxes1; i++) { 16 | for (int j = 0; j < num_boxes2; j++) { 17 | ious[i * num_boxes2 + j] = single_box_iou_rotated( 18 | boxes1[i].data_ptr(), boxes2[j].data_ptr()); 19 | } 20 | } 21 | } 22 | 23 | at::Tensor box_iou_rotated_cpu( 24 | // input must be contiguous: 25 | const at::Tensor& boxes1, 26 | const at::Tensor& boxes2) { 27 | auto num_boxes1 = boxes1.size(0); 28 | auto num_boxes2 = boxes2.size(0); 29 | at::Tensor ious = 30 | at::empty({num_boxes1 * num_boxes2}, boxes1.options().dtype(at::kFloat)); 31 | 32 | box_iou_rotated_cpu_kernel(boxes1, boxes2, ious); 33 | 34 | // reshape from 1d array to 2d array 35 | auto shape = std::vector{num_boxes1, num_boxes2}; 36 | return ious.reshape(shape); 37 | } 38 | 39 | } // namespace detectron2 40 | -------------------------------------------------------------------------------- /detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. 
2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "box_iou_rotated_utils.h" 7 | 8 | namespace detectron2 { 9 | 10 | // 2D block with 32 * 16 = 512 threads per block 11 | const int BLOCK_DIM_X = 32; 12 | const int BLOCK_DIM_Y = 16; 13 | 14 | template 15 | __global__ void box_iou_rotated_cuda_kernel( 16 | const int n_boxes1, 17 | const int n_boxes2, 18 | const T* dev_boxes1, 19 | const T* dev_boxes2, 20 | T* dev_ious) { 21 | const int row_start = blockIdx.x * blockDim.x; 22 | const int col_start = blockIdx.y * blockDim.y; 23 | 24 | const int row_size = min(n_boxes1 - row_start, blockDim.x); 25 | const int col_size = min(n_boxes2 - col_start, blockDim.y); 26 | 27 | __shared__ float block_boxes1[BLOCK_DIM_X * 5]; 28 | __shared__ float block_boxes2[BLOCK_DIM_Y * 5]; 29 | 30 | // It's safe to copy using threadIdx.x since BLOCK_DIM_X >= BLOCK_DIM_Y 31 | if (threadIdx.x < row_size && threadIdx.y == 0) { 32 | block_boxes1[threadIdx.x * 5 + 0] = 33 | dev_boxes1[(row_start + threadIdx.x) * 5 + 0]; 34 | block_boxes1[threadIdx.x * 5 + 1] = 35 | dev_boxes1[(row_start + threadIdx.x) * 5 + 1]; 36 | block_boxes1[threadIdx.x * 5 + 2] = 37 | dev_boxes1[(row_start + threadIdx.x) * 5 + 2]; 38 | block_boxes1[threadIdx.x * 5 + 3] = 39 | dev_boxes1[(row_start + threadIdx.x) * 5 + 3]; 40 | block_boxes1[threadIdx.x * 5 + 4] = 41 | dev_boxes1[(row_start + threadIdx.x) * 5 + 4]; 42 | } 43 | 44 | if (threadIdx.x < col_size && threadIdx.y == 0) { 45 | block_boxes2[threadIdx.x * 5 + 0] = 46 | dev_boxes2[(col_start + threadIdx.x) * 5 + 0]; 47 | block_boxes2[threadIdx.x * 5 + 1] = 48 | dev_boxes2[(col_start + threadIdx.x) * 5 + 1]; 49 | block_boxes2[threadIdx.x * 5 + 2] = 50 | dev_boxes2[(col_start + threadIdx.x) * 5 + 2]; 51 | block_boxes2[threadIdx.x * 5 + 3] = 52 | dev_boxes2[(col_start + threadIdx.x) * 5 + 3]; 53 | block_boxes2[threadIdx.x * 5 + 4] = 54 | dev_boxes2[(col_start + threadIdx.x) * 5 + 4]; 55 | } 56 | __syncthreads(); 57 | 58 | if (threadIdx.x < row_size && threadIdx.y < col_size) { 59 | int offset = (row_start + threadIdx.x) * n_boxes2 + col_start + threadIdx.y; 60 | dev_ious[offset] = single_box_iou_rotated( 61 | block_boxes1 + threadIdx.x * 5, block_boxes2 + threadIdx.y * 5); 62 | } 63 | } 64 | 65 | at::Tensor box_iou_rotated_cuda( 66 | // input must be contiguous 67 | const at::Tensor& boxes1, 68 | const at::Tensor& boxes2) { 69 | using scalar_t = float; 70 | AT_ASSERTM( 71 | boxes1.scalar_type() == at::kFloat, "boxes1 must be a float tensor"); 72 | AT_ASSERTM( 73 | boxes2.scalar_type() == at::kFloat, "boxes2 must be a float tensor"); 74 | AT_ASSERTM(boxes1.is_cuda(), "boxes1 must be a CUDA tensor"); 75 | AT_ASSERTM(boxes2.is_cuda(), "boxes2 must be a CUDA tensor"); 76 | at::cuda::CUDAGuard device_guard(boxes1.device()); 77 | 78 | auto num_boxes1 = boxes1.size(0); 79 | auto num_boxes2 = boxes2.size(0); 80 | 81 | at::Tensor ious = 82 | at::empty({num_boxes1 * num_boxes2}, boxes1.options().dtype(at::kFloat)); 83 | 84 | bool transpose = false; 85 | if (num_boxes1 > 0 && num_boxes2 > 0) { 86 | scalar_t *data1 = boxes1.data_ptr(), 87 | *data2 = boxes2.data_ptr(); 88 | 89 | if (num_boxes2 > 65535 * BLOCK_DIM_Y) { 90 | AT_ASSERTM( 91 | num_boxes1 <= 65535 * BLOCK_DIM_Y, 92 | "Too many boxes for box_iou_rotated_cuda!"); 93 | // x dim is allowed to be large, but y dim cannot, 94 | // so we transpose the two to avoid "invalid configuration argument" 95 | // error. We assume one of them is small. Otherwise the result is hard to 96 | // fit in memory anyway. 
97 | std::swap(num_boxes1, num_boxes2); 98 | std::swap(data1, data2); 99 | transpose = true; 100 | } 101 | 102 | const int blocks_x = 103 | at::cuda::ATenCeilDiv(static_cast(num_boxes1), BLOCK_DIM_X); 104 | const int blocks_y = 105 | at::cuda::ATenCeilDiv(static_cast(num_boxes2), BLOCK_DIM_Y); 106 | 107 | dim3 blocks(blocks_x, blocks_y); 108 | dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); 109 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 110 | 111 | box_iou_rotated_cuda_kernel<<>>( 112 | num_boxes1, 113 | num_boxes2, 114 | data1, 115 | data2, 116 | (scalar_t*)ious.data_ptr()); 117 | 118 | AT_CUDA_CHECK(cudaGetLastError()); 119 | } 120 | 121 | // reshape from 1d array to 2d array 122 | auto shape = std::vector{num_boxes1, num_boxes2}; 123 | if (transpose) { 124 | return ious.view(shape).t(); 125 | } else { 126 | return ious.view(shape); 127 | } 128 | } 129 | 130 | } // namespace detectron2 131 | -------------------------------------------------------------------------------- /detectron2/layers/csrc/cocoeval/cocoeval.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. 2 | #pragma once 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | namespace py = pybind11; 11 | 12 | namespace detectron2 { 13 | 14 | namespace COCOeval { 15 | 16 | // Annotation data for a single object instance in an image 17 | struct InstanceAnnotation { 18 | InstanceAnnotation( 19 | uint64_t id, 20 | double score, 21 | double area, 22 | bool is_crowd, 23 | bool ignore) 24 | : id{id}, score{score}, area{area}, is_crowd{is_crowd}, ignore{ignore} {} 25 | uint64_t id; 26 | double score = 0.; 27 | double area = 0.; 28 | bool is_crowd = false; 29 | bool ignore = false; 30 | }; 31 | 32 | // Stores intermediate results for evaluating detection results for a single 33 | // image that has D detected instances and G ground truth instances. This stores 34 | // matches between detected and ground truth instances 35 | struct ImageEvaluation { 36 | // For each of the D detected instances, the id of the matched ground truth 37 | // instance, or 0 if unmatched 38 | std::vector detection_matches; 39 | 40 | // The detection score of each of the D detected instances 41 | std::vector detection_scores; 42 | 43 | // Marks whether or not each of G instances was ignored from evaluation (e.g., 44 | // because it's outside area_range) 45 | std::vector ground_truth_ignores; 46 | 47 | // Marks whether or not each of D instances was ignored from evaluation (e.g., 48 | // because it's outside aRng) 49 | std::vector detection_ignores; 50 | }; 51 | 52 | template 53 | using ImageCategoryInstances = std::vector>>; 54 | 55 | // C++ implementation of COCO API cocoeval.py::COCOeval.evaluateImg(). For each 56 | // combination of image, category, area range settings, and IOU thresholds to 57 | // evaluate, it matches detected instances to ground truth instances and stores 58 | // the results into a vector of ImageEvaluation results, which will be 59 | // interpreted by the COCOeval::Accumulate() function to produce precion-recall 60 | // curves. 
The parameters of nested vectors have the following semantics: 61 | // image_category_ious[i][c][d][g] is the intersection over union of the d'th 62 | // detected instance and g'th ground truth instance of 63 | // category category_ids[c] in image image_ids[i] 64 | // image_category_ground_truth_instances[i][c] is a vector of ground truth 65 | // instances in image image_ids[i] of category category_ids[c] 66 | // image_category_detection_instances[i][c] is a vector of detected 67 | // instances in image image_ids[i] of category category_ids[c] 68 | std::vector EvaluateImages( 69 | const std::vector>& area_ranges, // vector of 2-tuples 70 | int max_detections, 71 | const std::vector& iou_thresholds, 72 | const ImageCategoryInstances>& image_category_ious, 73 | const ImageCategoryInstances& 74 | image_category_ground_truth_instances, 75 | const ImageCategoryInstances& 76 | image_category_detection_instances); 77 | 78 | // C++ implementation of COCOeval.accumulate(), which generates precision 79 | // recall curves for each set of category, IOU threshold, detection area range, 80 | // and max number of detections parameters. It is assumed that the parameter 81 | // evaluations is the return value of the functon COCOeval::EvaluateImages(), 82 | // which was called with the same parameter settings params 83 | py::dict Accumulate( 84 | const py::object& params, 85 | const std::vector& evalutations); 86 | 87 | } // namespace COCOeval 88 | } // namespace detectron2 89 | -------------------------------------------------------------------------------- /detectron2/layers/csrc/cuda_version.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | #include 4 | 5 | namespace detectron2 { 6 | int get_cudart_version() { 7 | // Not a ROCM platform: Either HIP is not used, or 8 | // it is used, but platform is not ROCM (i.e. it is CUDA) 9 | #if !defined(__HIP_PLATFORM_HCC__) 10 | return CUDART_VERSION; 11 | #else 12 | int version = 0; 13 | 14 | #if HIP_VERSION_MAJOR != 0 15 | // Create a convention similar to that of CUDA, as assumed by other 16 | // parts of the code. 17 | 18 | version = HIP_VERSION_MINOR; 19 | version += (HIP_VERSION_MAJOR * 100); 20 | #else 21 | hipRuntimeGetVersion(&version); 22 | #endif 23 | return version; 24 | #endif 25 | } 26 | } // namespace detectron2 27 | -------------------------------------------------------------------------------- /detectron2/layers/csrc/nms_rotated/nms_rotated.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. 
2 | #pragma once 3 | #include 4 | 5 | namespace detectron2 { 6 | 7 | at::Tensor nms_rotated_cpu( 8 | const at::Tensor& dets, 9 | const at::Tensor& scores, 10 | const double iou_threshold); 11 | 12 | #if defined(WITH_CUDA) || defined(WITH_HIP) 13 | at::Tensor nms_rotated_cuda( 14 | const at::Tensor& dets, 15 | const at::Tensor& scores, 16 | const double iou_threshold); 17 | #endif 18 | 19 | // Interface for Python 20 | // inline is needed to prevent multiple function definitions when this header is 21 | // included by different cpps 22 | inline at::Tensor nms_rotated( 23 | const at::Tensor& dets, 24 | const at::Tensor& scores, 25 | const double iou_threshold) { 26 | assert(dets.device().is_cuda() == scores.device().is_cuda()); 27 | if (dets.device().is_cuda()) { 28 | #if defined(WITH_CUDA) || defined(WITH_HIP) 29 | return nms_rotated_cuda( 30 | dets.contiguous(), scores.contiguous(), iou_threshold); 31 | #else 32 | AT_ERROR("Detectron2 is not compiled with GPU support!"); 33 | #endif 34 | } 35 | 36 | return nms_rotated_cpu(dets.contiguous(), scores.contiguous(), iou_threshold); 37 | } 38 | 39 | } // namespace detectron2 40 | -------------------------------------------------------------------------------- /detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. 2 | #include "../box_iou_rotated/box_iou_rotated_utils.h" 3 | #include "nms_rotated.h" 4 | 5 | namespace detectron2 { 6 | 7 | template 8 | at::Tensor nms_rotated_cpu_kernel( 9 | const at::Tensor& dets, 10 | const at::Tensor& scores, 11 | const double iou_threshold) { 12 | // nms_rotated_cpu_kernel is modified from torchvision's nms_cpu_kernel, 13 | // however, the code in this function is much shorter because 14 | // we delegate the IoU computation for rotated boxes to 15 | // the single_box_iou_rotated function in box_iou_rotated_utils.h 16 | AT_ASSERTM(dets.device().is_cpu(), "dets must be a CPU tensor"); 17 | AT_ASSERTM(scores.device().is_cpu(), "scores must be a CPU tensor"); 18 | AT_ASSERTM( 19 | dets.scalar_type() == scores.scalar_type(), 20 | "dets should have the same type as scores"); 21 | 22 | if (dets.numel() == 0) { 23 | return at::empty({0}, dets.options().dtype(at::kLong)); 24 | } 25 | 26 | auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); 27 | 28 | auto ndets = dets.size(0); 29 | at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte)); 30 | at::Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong)); 31 | 32 | auto suppressed = suppressed_t.data_ptr(); 33 | auto keep = keep_t.data_ptr(); 34 | auto order = order_t.data_ptr(); 35 | 36 | int64_t num_to_keep = 0; 37 | 38 | for (int64_t _i = 0; _i < ndets; _i++) { 39 | auto i = order[_i]; 40 | if (suppressed[i] == 1) { 41 | continue; 42 | } 43 | 44 | keep[num_to_keep++] = i; 45 | 46 | for (int64_t _j = _i + 1; _j < ndets; _j++) { 47 | auto j = order[_j]; 48 | if (suppressed[j] == 1) { 49 | continue; 50 | } 51 | 52 | auto ovr = single_box_iou_rotated( 53 | dets[i].data_ptr(), dets[j].data_ptr()); 54 | if (ovr >= iou_threshold) { 55 | suppressed[j] = 1; 56 | } 57 | } 58 | } 59 | return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep); 60 | } 61 | 62 | at::Tensor nms_rotated_cpu( 63 | // input must be contiguous 64 | const at::Tensor& dets, 65 | const at::Tensor& scores, 66 | const double iou_threshold) { 67 | auto result = at::empty({0}, dets.options()); 68 | 69 | 
69 | AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms_rotated", [&] {
70 | result = nms_rotated_cpu_kernel<scalar_t>(dets, scores, iou_threshold);
71 | });
72 | return result;
73 | }
74 | 
75 | } // namespace detectron2
76 | 
-------------------------------------------------------------------------------- /detectron2/layers/csrc/vision.cpp: --------------------------------------------------------------------------------
1 | // Copyright (c) Facebook, Inc. and its affiliates.
2 | 
3 | #include <torch/extension.h>
4 | #include "ROIAlignRotated/ROIAlignRotated.h"
5 | #include "box_iou_rotated/box_iou_rotated.h"
6 | #include "cocoeval/cocoeval.h"
7 | #include "deformable/deform_conv.h"
8 | #include "nms_rotated/nms_rotated.h"
9 | 
10 | namespace detectron2 {
11 | 
12 | #if defined(WITH_CUDA) || defined(WITH_HIP)
13 | extern int get_cudart_version();
14 | #endif
15 | 
16 | std::string get_cuda_version() {
17 | #if defined(WITH_CUDA) || defined(WITH_HIP)
18 | std::ostringstream oss;
19 | 
20 | #if defined(WITH_CUDA)
21 | oss << "CUDA ";
22 | #else
23 | oss << "HIP ";
24 | #endif
25 | 
26 | // copied from
27 | // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231
28 | auto printCudaStyleVersion = [&](int v) {
29 | oss << (v / 1000) << "." << (v / 10 % 100);
30 | if (v % 10 != 0) {
31 | oss << "." << (v % 10);
32 | }
33 | };
34 | printCudaStyleVersion(get_cudart_version());
35 | return oss.str();
36 | #else // neither CUDA nor HIP
37 | return std::string("not available");
38 | #endif
39 | }
40 | 
41 | bool has_cuda() {
42 | #if defined(WITH_CUDA)
43 | return true;
44 | #else
45 | return false;
46 | #endif
47 | }
48 | 
49 | // similar to
50 | // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp
51 | std::string get_compiler_version() {
52 | std::ostringstream ss;
53 | #if defined(__GNUC__)
54 | #ifndef __clang__
55 | 
56 | #if ((__GNUC__ <= 4) && (__GNUC_MINOR__ <= 8))
57 | #error "GCC >= 4.9 is required!"
58 | #endif
59 | 
60 | { ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; }
61 | #endif
62 | #endif
63 | 
64 | #if defined(__clang_major__)
65 | {
66 | ss << "clang " << __clang_major__ << "." << __clang_minor__ << "."
67 | << __clang_patchlevel__;
68 | }
69 | #endif
70 | 
71 | #if defined(_MSC_VER)
72 | { ss << "MSVC " << _MSC_FULL_VER; }
73 | #endif
74 | return ss.str();
75 | }
76 | 
77 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
78 | m.def("get_compiler_version", &get_compiler_version, "get_compiler_version");
79 | m.def("get_cuda_version", &get_cuda_version, "get_cuda_version");
80 | m.def("has_cuda", &has_cuda, "has_cuda");
81 | 
82 | m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward");
83 | m.def(
84 | "deform_conv_backward_input",
85 | &deform_conv_backward_input,
86 | "deform_conv_backward_input");
87 | m.def(
88 | "deform_conv_backward_filter",
89 | &deform_conv_backward_filter,
90 | "deform_conv_backward_filter");
91 | m.def(
92 | "modulated_deform_conv_forward",
93 | &modulated_deform_conv_forward,
94 | "modulated_deform_conv_forward");
95 | m.def(
96 | "modulated_deform_conv_backward",
97 | &modulated_deform_conv_backward,
98 | "modulated_deform_conv_backward");
99 | 
100 | m.def("COCOevalAccumulate", &COCOeval::Accumulate, "COCOeval::Accumulate");
101 | m.def(
102 | "COCOevalEvaluateImages",
103 | &COCOeval::EvaluateImages,
104 | "COCOeval::EvaluateImages");
105 | pybind11::class_<COCOeval::InstanceAnnotation>(m, "InstanceAnnotation")
106 | .def(pybind11::init<uint64_t, double, double, bool, bool>());
107 | pybind11::class_<COCOeval::ImageEvaluation>(m, "ImageEvaluation")
108 | .def(pybind11::init<>());
109 | }
110 | 
111 | TORCH_LIBRARY(detectron2, m) {
112 | m.def("nms_rotated", &nms_rotated);
113 | m.def("box_iou_rotated", &box_iou_rotated);
114 | m.def("roi_align_rotated_forward", &ROIAlignRotated_forward);
115 | m.def("roi_align_rotated_backward", &ROIAlignRotated_backward);
116 | }
117 | } // namespace detectron2
118 | 
-------------------------------------------------------------------------------- /detectron2/layers/losses.py: --------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | 
4 | 
5 | def diou_loss(
6 | boxes1: torch.Tensor,
7 | boxes2: torch.Tensor,
8 | reduction: str = "none",
9 | eps: float = 1e-7,
10 | ) -> torch.Tensor:
11 | """
12 | Distance Intersection over Union Loss (Zhaohui Zheng et. al)
13 | https://arxiv.org/abs/1911.08287
14 | Args:
15 | boxes1, boxes2 (Tensor): box locations in XYXY format, shape (N, 4) or (4,).
16 | reduction: 'none' | 'mean' | 'sum'
17 | 'none': No reduction will be applied to the output.
18 | 'mean': The output will be averaged.
19 | 'sum': The output will be summed.
20 | eps (float): small number to prevent division by zero 21 | """ 22 | 23 | x1, y1, x2, y2 = boxes1.unbind(dim=-1) 24 | x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1) 25 | 26 | # TODO: use torch._assert_async() when pytorch 1.8 support is dropped 27 | assert (x2 >= x1).all(), "bad box: x1 larger than x2" 28 | assert (y2 >= y1).all(), "bad box: y1 larger than y2" 29 | 30 | # Intersection keypoints 31 | xkis1 = torch.max(x1, x1g) 32 | ykis1 = torch.max(y1, y1g) 33 | xkis2 = torch.min(x2, x2g) 34 | ykis2 = torch.min(y2, y2g) 35 | 36 | intsct = torch.zeros_like(x1) 37 | mask = (ykis2 > ykis1) & (xkis2 > xkis1) 38 | intsct[mask] = (xkis2[mask] - xkis1[mask]) * (ykis2[mask] - ykis1[mask]) 39 | union = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsct + eps 40 | iou = intsct / union 41 | 42 | # smallest enclosing box 43 | xc1 = torch.min(x1, x1g) 44 | yc1 = torch.min(y1, y1g) 45 | xc2 = torch.max(x2, x2g) 46 | yc2 = torch.max(y2, y2g) 47 | diag_len = ((xc2 - xc1) ** 2) + ((yc2 - yc1) ** 2) + eps 48 | 49 | # centers of boxes 50 | x_p = (x2 + x1) / 2 51 | y_p = (y2 + y1) / 2 52 | x_g = (x1g + x2g) / 2 53 | y_g = (y1g + y2g) / 2 54 | distance = ((x_p - x_g) ** 2) + ((y_p - y_g) ** 2) 55 | 56 | # Eqn. (7) 57 | loss = 1 - iou + (distance / diag_len) 58 | if reduction == "mean": 59 | loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum() 60 | elif reduction == "sum": 61 | loss = loss.sum() 62 | 63 | return loss 64 | 65 | 66 | def ciou_loss( 67 | boxes1: torch.Tensor, 68 | boxes2: torch.Tensor, 69 | reduction: str = "none", 70 | eps: float = 1e-7, 71 | ) -> torch.Tensor: 72 | """ 73 | Complete Intersection over Union Loss (Zhaohui Zheng et. al) 74 | https://arxiv.org/abs/1911.08287 75 | Args: 76 | boxes1, boxes2 (Tensor): box locations in XYXY format, shape (N, 4) or (4,). 77 | reduction: 'none' | 'mean' | 'sum' 78 | 'none': No reduction will be applied to the output. 79 | 'mean': The output will be averaged. 80 | 'sum': The output will be summed. 81 | eps (float): small number to prevent division by zero 82 | """ 83 | 84 | x1, y1, x2, y2 = boxes1.unbind(dim=-1) 85 | x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1) 86 | 87 | # TODO: use torch._assert_async() when pytorch 1.8 support is dropped 88 | assert (x2 >= x1).all(), "bad box: x1 larger than x2" 89 | assert (y2 >= y1).all(), "bad box: y1 larger than y2" 90 | 91 | # Intersection keypoints 92 | xkis1 = torch.max(x1, x1g) 93 | ykis1 = torch.max(y1, y1g) 94 | xkis2 = torch.min(x2, x2g) 95 | ykis2 = torch.min(y2, y2g) 96 | 97 | intsct = torch.zeros_like(x1) 98 | mask = (ykis2 > ykis1) & (xkis2 > xkis1) 99 | intsct[mask] = (xkis2[mask] - xkis1[mask]) * (ykis2[mask] - ykis1[mask]) 100 | union = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsct + eps 101 | iou = intsct / union 102 | 103 | # smallest enclosing box 104 | xc1 = torch.min(x1, x1g) 105 | yc1 = torch.min(y1, y1g) 106 | xc2 = torch.max(x2, x2g) 107 | yc2 = torch.max(y2, y2g) 108 | diag_len = ((xc2 - xc1) ** 2) + ((yc2 - yc1) ** 2) + eps 109 | 110 | # centers of boxes 111 | x_p = (x2 + x1) / 2 112 | y_p = (y2 + y1) / 2 113 | x_g = (x1g + x2g) / 2 114 | y_g = (y1g + y2g) / 2 115 | distance = ((x_p - x_g) ** 2) + ((y_p - y_g) ** 2) 116 | 117 | # width and height of boxes 118 | w_pred = x2 - x1 119 | h_pred = y2 - y1 120 | w_gt = x2g - x1g 121 | h_gt = y2g - y1g 122 | v = (4 / (math.pi ** 2)) * torch.pow((torch.atan(w_gt / h_gt) - torch.atan(w_pred / h_pred)), 2) 123 | with torch.no_grad(): 124 | alpha = v / (1 - iou + v + eps) 125 | 126 | # Eqn. 
(10) 127 | loss = 1 - iou + (distance / diag_len) + alpha * v 128 | if reduction == "mean": 129 | loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum() 130 | elif reduction == "sum": 131 | loss = loss.sum() 132 | 133 | return loss 134 | -------------------------------------------------------------------------------- /detectron2/layers/roi_align.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from torch import nn 3 | from torchvision.ops import roi_align 4 | 5 | 6 | # NOTE: torchvision's RoIAlign has a different default aligned=False 7 | class ROIAlign(nn.Module): 8 | def __init__(self, output_size, spatial_scale, sampling_ratio, aligned=True): 9 | """ 10 | Args: 11 | output_size (tuple): h, w 12 | spatial_scale (float): scale the input boxes by this number 13 | sampling_ratio (int): number of inputs samples to take for each output 14 | sample. 0 to take samples densely. 15 | aligned (bool): if False, use the legacy implementation in 16 | Detectron. If True, align the results more perfectly. 17 | 18 | Note: 19 | The meaning of aligned=True: 20 | 21 | Given a continuous coordinate c, its two neighboring pixel indices (in our 22 | pixel model) are computed by floor(c - 0.5) and ceil(c - 0.5). For example, 23 | c=1.3 has pixel neighbors with discrete indices [0] and [1] (which are sampled 24 | from the underlying signal at continuous coordinates 0.5 and 1.5). But the original 25 | roi_align (aligned=False) does not subtract the 0.5 when computing neighboring 26 | pixel indices and therefore it uses pixels with a slightly incorrect alignment 27 | (relative to our pixel model) when performing bilinear interpolation. 28 | 29 | With `aligned=True`, 30 | we first appropriately scale the ROI and then shift it by -0.5 31 | prior to calling roi_align. This produces the correct neighbors; see 32 | detectron2/tests/test_roi_align.py for verification. 33 | 34 | The difference does not make a difference to the model's performance if 35 | ROIAlign is used together with conv layers. 36 | """ 37 | super().__init__() 38 | self.output_size = output_size 39 | self.spatial_scale = spatial_scale 40 | self.sampling_ratio = sampling_ratio 41 | self.aligned = aligned 42 | 43 | from torchvision import __version__ 44 | 45 | version = tuple(int(x) for x in __version__.split(".")[:2]) 46 | # https://github.com/pytorch/vision/pull/2438 47 | assert version >= (0, 7), "Require torchvision >= 0.7" 48 | 49 | def forward(self, input, rois): 50 | """ 51 | Args: 52 | input: NCHW images 53 | rois: Bx5 boxes. First column is the index into N. The other 4 columns are xyxy. 54 | """ 55 | assert rois.dim() == 2 and rois.size(1) == 5 56 | if input.is_quantized: 57 | input = input.dequantize() 58 | return roi_align( 59 | input, 60 | rois.to(dtype=input.dtype), 61 | self.output_size, 62 | self.spatial_scale, 63 | self.sampling_ratio, 64 | self.aligned, 65 | ) 66 | 67 | def __repr__(self): 68 | tmpstr = self.__class__.__name__ + "(" 69 | tmpstr += "output_size=" + str(self.output_size) 70 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 71 | tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) 72 | tmpstr += ", aligned=" + str(self.aligned) 73 | tmpstr += ")" 74 | return tmpstr 75 | -------------------------------------------------------------------------------- /detectron2/layers/roi_align_rotated.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. 
and its affiliates. 2 | import torch 3 | from torch import nn 4 | from torch.autograd import Function 5 | from torch.autograd.function import once_differentiable 6 | from torch.nn.modules.utils import _pair 7 | 8 | 9 | class _ROIAlignRotated(Function): 10 | @staticmethod 11 | def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio): 12 | ctx.save_for_backward(roi) 13 | ctx.output_size = _pair(output_size) 14 | ctx.spatial_scale = spatial_scale 15 | ctx.sampling_ratio = sampling_ratio 16 | ctx.input_shape = input.size() 17 | output = torch.ops.detectron2.roi_align_rotated_forward( 18 | input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio 19 | ) 20 | return output 21 | 22 | @staticmethod 23 | @once_differentiable 24 | def backward(ctx, grad_output): 25 | (rois,) = ctx.saved_tensors 26 | output_size = ctx.output_size 27 | spatial_scale = ctx.spatial_scale 28 | sampling_ratio = ctx.sampling_ratio 29 | bs, ch, h, w = ctx.input_shape 30 | grad_input = torch.ops.detectron2.roi_align_rotated_backward( 31 | grad_output, 32 | rois, 33 | spatial_scale, 34 | output_size[0], 35 | output_size[1], 36 | bs, 37 | ch, 38 | h, 39 | w, 40 | sampling_ratio, 41 | ) 42 | return grad_input, None, None, None, None, None 43 | 44 | 45 | roi_align_rotated = _ROIAlignRotated.apply 46 | 47 | 48 | class ROIAlignRotated(nn.Module): 49 | def __init__(self, output_size, spatial_scale, sampling_ratio): 50 | """ 51 | Args: 52 | output_size (tuple): h, w 53 | spatial_scale (float): scale the input boxes by this number 54 | sampling_ratio (int): number of inputs samples to take for each output 55 | sample. 0 to take samples densely. 56 | 57 | Note: 58 | ROIAlignRotated supports continuous coordinate by default: 59 | Given a continuous coordinate c, its two neighboring pixel indices (in our 60 | pixel model) are computed by floor(c - 0.5) and ceil(c - 0.5). For example, 61 | c=1.3 has pixel neighbors with discrete indices [0] and [1] (which are sampled 62 | from the underlying signal at continuous coordinates 0.5 and 1.5). 63 | """ 64 | super(ROIAlignRotated, self).__init__() 65 | self.output_size = output_size 66 | self.spatial_scale = spatial_scale 67 | self.sampling_ratio = sampling_ratio 68 | 69 | def forward(self, input, rois): 70 | """ 71 | Args: 72 | input: NCHW images 73 | rois: Bx6 boxes. First column is the index into N. 74 | The other 5 columns are (x_ctr, y_ctr, width, height, angle_degrees). 75 | """ 76 | assert rois.dim() == 2 and rois.size(1) == 6 77 | orig_dtype = input.dtype 78 | if orig_dtype == torch.float16: 79 | input = input.float() 80 | rois = rois.float() 81 | return roi_align_rotated( 82 | input, rois, self.output_size, self.spatial_scale, self.sampling_ratio 83 | ).to(dtype=orig_dtype) 84 | 85 | def __repr__(self): 86 | tmpstr = self.__class__.__name__ + "(" 87 | tmpstr += "output_size=" + str(self.output_size) 88 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 89 | tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) 90 | tmpstr += ")" 91 | return tmpstr 92 | -------------------------------------------------------------------------------- /detectron2/layers/rotated_boxes.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from __future__ import absolute_import, division, print_function, unicode_literals 3 | import torch 4 | 5 | 6 | def pairwise_iou_rotated(boxes1, boxes2): 7 | """ 8 | Return intersection-over-union (Jaccard index) of boxes. 
9 | 
10 | Both sets of boxes are expected to be in
11 | (x_center, y_center, width, height, angle) format.
12 | 
13 | Arguments:
14 | boxes1 (Tensor[N, 5])
15 | boxes2 (Tensor[M, 5])
16 | 
17 | Returns:
18 | iou (Tensor[N, M]): the NxM matrix containing the pairwise
19 | IoU values for every element in boxes1 and boxes2
20 | """
21 | return torch.ops.detectron2.box_iou_rotated(boxes1, boxes2)
22 | 
-------------------------------------------------------------------------------- /detectron2/layers/shape_spec.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (c) Facebook, Inc. and its affiliates.
3 | from collections import namedtuple
4 | 
5 | 
6 | class ShapeSpec(namedtuple("_ShapeSpec", ["channels", "height", "width", "stride"])):
7 | """
8 | A simple structure that contains basic shape specification about a tensor.
9 | It is often used as the auxiliary inputs/outputs of models,
10 | to complement the lack of shape inference ability among pytorch modules.
11 | 
12 | Attributes:
13 | channels:
14 | height:
15 | width:
16 | stride:
17 | """
18 | 
19 | def __new__(cls, channels=None, height=None, width=None, stride=None):
20 | return super().__new__(cls, channels, height, width, stride)
21 | 
-------------------------------------------------------------------------------- /detectron2/layers/wrappers.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | """
3 | Wrappers around some nn functions, mainly to support empty tensors.
4 | 
5 | Ideally, add support directly in PyTorch for empty tensors in those functions.
6 | 
7 | These can be removed once https://github.com/pytorch/pytorch/issues/12013
8 | is implemented.
9 | """
10 | 
11 | from typing import List, Optional
12 | import torch
13 | from torch.nn import functional as F
14 | 
15 | 
16 | def shapes_to_tensor(x: List[int], device: Optional[torch.device] = None) -> torch.Tensor:
17 | """
18 | Turn a list of integer scalars or integer Tensor scalars into a vector,
19 | in a way that's both traceable and scriptable.
20 | 
21 | In tracing, `x` should be a list of scalar Tensor, so the output can trace to the inputs.
22 | In scripting or eager, `x` should be a list of int.
23 | """
24 | if torch.jit.is_scripting():
25 | return torch.as_tensor(x, device=device)
26 | if torch.jit.is_tracing():
27 | assert all(
28 | [isinstance(t, torch.Tensor) for t in x]
29 | ), "Shape should be tensor during tracing!"
30 | # as_tensor should not be used in tracing because it records a constant
31 | ret = torch.stack(x)
32 | if ret.device != device: # avoid recording a hard-coded device if not necessary
33 | ret = ret.to(device=device)
34 | return ret
35 | return torch.as_tensor(x, device=device)
36 | 
37 | 
38 | def cat(tensors: List[torch.Tensor], dim: int = 0):
39 | """
40 | Efficient version of torch.cat that avoids a copy if there is only a single element in a list
41 | """
42 | assert isinstance(tensors, (list, tuple))
43 | if len(tensors) == 1:
44 | return tensors[0]
45 | return torch.cat(tensors, dim)
46 | 
47 | 
48 | def cross_entropy(input, target, *, reduction="mean", **kwargs):
49 | """
50 | Same as `torch.nn.functional.cross_entropy`, but returns 0 (instead of nan)
51 | for empty inputs.
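For example, an empty (0, num_classes) logits tensor with an empty target and the
default "mean" reduction yields a zero scalar that still participates in autograd,
where plain F.cross_entropy would return NaN.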
52 | """ 53 | if target.numel() == 0 and reduction == "mean": 54 | return input.sum() * 0.0 # connect the gradient 55 | return F.cross_entropy(input, target, reduction=reduction, **kwargs) 56 | 57 | 58 | class _NewEmptyTensorOp(torch.autograd.Function): 59 | @staticmethod 60 | def forward(ctx, x, new_shape): 61 | ctx.shape = x.shape 62 | return x.new_empty(new_shape) 63 | 64 | @staticmethod 65 | def backward(ctx, grad): 66 | shape = ctx.shape 67 | return _NewEmptyTensorOp.apply(grad, shape), None 68 | 69 | 70 | class Conv2d(torch.nn.Conv2d): 71 | """ 72 | A wrapper around :class:`torch.nn.Conv2d` to support empty inputs and more features. 73 | """ 74 | 75 | def __init__(self, *args, **kwargs): 76 | """ 77 | Extra keyword arguments supported in addition to those in `torch.nn.Conv2d`: 78 | 79 | Args: 80 | norm (nn.Module, optional): a normalization layer 81 | activation (callable(Tensor) -> Tensor): a callable activation function 82 | 83 | It assumes that norm layer is used before activation. 84 | """ 85 | norm = kwargs.pop("norm", None) 86 | activation = kwargs.pop("activation", None) 87 | super().__init__(*args, **kwargs) 88 | 89 | self.norm = norm 90 | self.activation = activation 91 | 92 | def forward(self, x): 93 | # torchscript does not support SyncBatchNorm yet 94 | # https://github.com/pytorch/pytorch/issues/40507 95 | # and we skip these codes in torchscript since: 96 | # 1. currently we only support torchscript in evaluation mode 97 | # 2. features needed by exporting module to torchscript are added in PyTorch 1.6 or 98 | # later version, `Conv2d` in these PyTorch versions has already supported empty inputs. 99 | if not torch.jit.is_scripting(): 100 | if x.numel() == 0 and self.training: 101 | # https://github.com/pytorch/pytorch/issues/12013 102 | assert not isinstance( 103 | self.norm, torch.nn.SyncBatchNorm 104 | ), "SyncBatchNorm does not support empty inputs!" 105 | 106 | x = F.conv2d( 107 | x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups 108 | ) 109 | if self.norm is not None: 110 | x = self.norm(x) 111 | if self.activation is not None: 112 | x = self.activation(x) 113 | return x 114 | 115 | 116 | ConvTranspose2d = torch.nn.ConvTranspose2d 117 | BatchNorm2d = torch.nn.BatchNorm2d 118 | interpolate = F.interpolate 119 | Linear = torch.nn.Linear 120 | 121 | 122 | def nonzero_tuple(x): 123 | """ 124 | A 'as_tuple=True' version of torch.nonzero to support torchscript. 125 | because of https://github.com/pytorch/pytorch/issues/38718 126 | """ 127 | if torch.jit.is_scripting(): 128 | if x.dim() == 0: 129 | return x.unsqueeze(0).nonzero().unbind(1) 130 | return x.nonzero().unbind(1) 131 | else: 132 | return x.nonzero(as_tuple=True) 133 | -------------------------------------------------------------------------------- /detectron2/model_zoo/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | """ 3 | Model Zoo API for Detectron2: a collection of functions to create common model architectures 4 | listed in `MODEL_ZOO.md `_, 5 | and optionally load their pre-trained weights. 
6 | """ 7 | 8 | from .model_zoo import get, get_config_file, get_checkpoint_url, get_config 9 | 10 | __all__ = ["get_checkpoint_url", "get", "get_config_file", "get_config"] 11 | -------------------------------------------------------------------------------- /detectron2/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from detectron2.layers import ShapeSpec 3 | 4 | from .anchor_generator import build_anchor_generator, ANCHOR_GENERATOR_REGISTRY 5 | from .backbone import ( 6 | BACKBONE_REGISTRY, 7 | FPN, 8 | Backbone, 9 | ResNet, 10 | ResNetBlockBase, 11 | build_backbone, 12 | build_resnet_backbone, 13 | make_stage, 14 | ) 15 | from .meta_arch import ( 16 | META_ARCH_REGISTRY, 17 | SEM_SEG_HEADS_REGISTRY, 18 | GeneralizedRCNN, 19 | PanopticFPN, 20 | ProposalNetwork, 21 | RetinaNet, 22 | SemanticSegmentor, 23 | build_model, 24 | build_sem_seg_head, 25 | FCOS, 26 | ) 27 | from .postprocessing import detector_postprocess 28 | from .proposal_generator import ( 29 | PROPOSAL_GENERATOR_REGISTRY, 30 | build_proposal_generator, 31 | RPN_HEAD_REGISTRY, 32 | build_rpn_head, 33 | ) 34 | from .roi_heads import ( 35 | ROI_BOX_HEAD_REGISTRY, 36 | ROI_HEADS_REGISTRY, 37 | ROI_KEYPOINT_HEAD_REGISTRY, 38 | ROI_MASK_HEAD_REGISTRY, 39 | ROIHeads, 40 | StandardROIHeads, 41 | BaseMaskRCNNHead, 42 | BaseKeypointRCNNHead, 43 | FastRCNNOutputLayers, 44 | build_box_head, 45 | build_keypoint_head, 46 | build_mask_head, 47 | build_roi_heads, 48 | ) 49 | from .test_time_augmentation import DatasetMapperTTA, GeneralizedRCNNWithTTA 50 | from .mmdet_wrapper import MMDetBackbone, MMDetDetector 51 | 52 | _EXCLUDE = {"ShapeSpec"} 53 | __all__ = [k for k in globals().keys() if k not in _EXCLUDE and not k.startswith("_")] 54 | 55 | 56 | from detectron2.utils.env import fixup_module_metadata 57 | 58 | fixup_module_metadata(__name__, globals(), __all__) 59 | del fixup_module_metadata 60 | -------------------------------------------------------------------------------- /detectron2/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .build import build_backbone, BACKBONE_REGISTRY # noqa F401 isort:skip 3 | 4 | from .backbone import Backbone 5 | from .fpn import FPN 6 | from .regnet import RegNet 7 | from .resnet import ( 8 | BasicStem, 9 | ResNet, 10 | ResNetBlockBase, 11 | build_resnet_backbone, 12 | make_stage, 13 | BottleneckBlock, 14 | ) 15 | 16 | __all__ = [k for k in globals().keys() if not k.startswith("_")] 17 | # TODO can expose more resnet blocks after careful consideration 18 | -------------------------------------------------------------------------------- /detectron2/modeling/backbone/backbone.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from abc import ABCMeta, abstractmethod 3 | import torch.nn as nn 4 | 5 | from detectron2.layers import ShapeSpec 6 | 7 | __all__ = ["Backbone"] 8 | 9 | 10 | class Backbone(nn.Module, metaclass=ABCMeta): 11 | """ 12 | Abstract base class for network backbones. 13 | """ 14 | 15 | def __init__(self): 16 | """ 17 | The `__init__` method of any subclass can specify its own set of arguments. 18 | """ 19 | super().__init__() 20 | 21 | @abstractmethod 22 | def forward(self): 23 | """ 24 | Subclasses must override this method, but adhere to the same return type. 
25 | 26 | Returns: 27 | dict[str->Tensor]: mapping from feature name (e.g., "res2") to tensor 28 | """ 29 | pass 30 | 31 | @property 32 | def size_divisibility(self) -> int: 33 | """ 34 | Some backbones require the input height and width to be divisible by a 35 | specific integer. This is typically true for encoder / decoder type networks 36 | with lateral connection (e.g., FPN) for which feature maps need to match 37 | dimension in the "bottom up" and "top down" paths. Set to 0 if no specific 38 | input size divisibility is required. 39 | """ 40 | return 0 41 | 42 | def output_shape(self): 43 | """ 44 | Returns: 45 | dict[str->ShapeSpec] 46 | """ 47 | # this is a backward-compatible default 48 | return { 49 | name: ShapeSpec( 50 | channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] 51 | ) 52 | for name in self._out_features 53 | } 54 | -------------------------------------------------------------------------------- /detectron2/modeling/backbone/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from detectron2.layers import ShapeSpec 3 | from detectron2.utils.registry import Registry 4 | 5 | from .backbone import Backbone 6 | 7 | BACKBONE_REGISTRY = Registry("BACKBONE") 8 | BACKBONE_REGISTRY.__doc__ = """ 9 | Registry for backbones, which extract feature maps from images 10 | 11 | The registered object must be a callable that accepts two arguments: 12 | 13 | 1. A :class:`detectron2.config.CfgNode` 14 | 2. A :class:`detectron2.layers.ShapeSpec`, which contains the input shape specification. 15 | 16 | Registered object must return instance of :class:`Backbone`. 17 | """ 18 | 19 | 20 | def build_backbone(cfg, input_shape=None): 21 | """ 22 | Build a backbone from `cfg.MODEL.BACKBONE.NAME`. 23 | 24 | Returns: 25 | an instance of :class:`Backbone` 26 | """ 27 | if input_shape is None: 28 | input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)) 29 | 30 | backbone_name = cfg.MODEL.BACKBONE.NAME 31 | backbone = BACKBONE_REGISTRY.get(backbone_name)(cfg, input_shape) 32 | assert isinstance(backbone, Backbone) 33 | return backbone 34 | -------------------------------------------------------------------------------- /detectron2/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | from .build import META_ARCH_REGISTRY, build_model # isort:skip 5 | 6 | from .panoptic_fpn import PanopticFPN 7 | 8 | # import all the meta_arch, so they will be registered 9 | from .rcnn import GeneralizedRCNN, ProposalNetwork 10 | from .dense_detector import DenseDetector 11 | from .retinanet import RetinaNet 12 | from .fcos import FCOS 13 | from .semantic_seg import SEM_SEG_HEADS_REGISTRY, SemanticSegmentor, build_sem_seg_head 14 | 15 | 16 | __all__ = list(globals().keys()) 17 | -------------------------------------------------------------------------------- /detectron2/modeling/meta_arch/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import torch 3 | 4 | from detectron2.utils.logger import _log_api_usage 5 | from detectron2.utils.registry import Registry 6 | 7 | META_ARCH_REGISTRY = Registry("META_ARCH") # noqa F401 isort:skip 8 | META_ARCH_REGISTRY.__doc__ = """ 9 | Registry for meta-architectures, i.e. the whole model. 
10 | 11 | The registered object will be called with `obj(cfg)` 12 | and expected to return a `nn.Module` object. 13 | """ 14 | 15 | 16 | def build_model(cfg): 17 | """ 18 | Build the whole model architecture, defined by ``cfg.MODEL.META_ARCHITECTURE``. 19 | Note that it does not load any weights from ``cfg``. 20 | """ 21 | meta_arch = cfg.MODEL.META_ARCHITECTURE 22 | model = META_ARCH_REGISTRY.get(meta_arch)(cfg) 23 | model.to(torch.device(cfg.MODEL.DEVICE)) 24 | _log_api_usage("modeling.meta_arch." + meta_arch) 25 | return model 26 | -------------------------------------------------------------------------------- /detectron2/modeling/postprocessing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import torch 3 | from torch.nn import functional as F 4 | 5 | from detectron2.structures import Instances, ROIMasks 6 | 7 | 8 | # perhaps should rename to "resize_instance" 9 | def detector_postprocess( 10 | results: Instances, output_height: int, output_width: int, mask_threshold: float = 0.5 11 | ): 12 | """ 13 | Resize the output instances. 14 | The input images are often resized when entering an object detector. 15 | As a result, we often need the outputs of the detector in a different 16 | resolution from its inputs. 17 | 18 | This function will resize the raw outputs of an R-CNN detector 19 | to produce outputs according to the desired output resolution. 20 | 21 | Args: 22 | results (Instances): the raw outputs from the detector. 23 | `results.image_size` contains the input image resolution the detector sees. 24 | This object might be modified in-place. 25 | output_height, output_width: the desired output resolution. 26 | 27 | Returns: 28 | Instances: the resized output from the model, based on the output resolution 29 | """ 30 | if isinstance(output_width, torch.Tensor): 31 | # This shape might (but not necessarily) be tensors during tracing. 32 | # Converts integer tensors to float temporaries to ensure true 33 | # division is performed when computing scale_x and scale_y. 34 | output_width_tmp = output_width.float() 35 | output_height_tmp = output_height.float() 36 | new_size = torch.stack([output_height, output_width]) 37 | else: 38 | new_size = (output_height, output_width) 39 | output_width_tmp = output_width 40 | output_height_tmp = output_height 41 | 42 | scale_x, scale_y = ( 43 | output_width_tmp / results.image_size[1], 44 | output_height_tmp / results.image_size[0], 45 | ) 46 | results = Instances(new_size, **results.get_fields()) 47 | 48 | if results.has("pred_boxes"): 49 | output_boxes = results.pred_boxes 50 | elif results.has("proposal_boxes"): 51 | output_boxes = results.proposal_boxes 52 | else: 53 | output_boxes = None 54 | assert output_boxes is not None, "Predictions must contain boxes!" 
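# Illustrative example (hypothetical numbers): if the detector saw an 800x1216 resize
# of a 600x912 image, results.image_size is (800, 1216), so scale_x = 912 / 1216 = 0.75
# and scale_y = 600 / 800 = 0.75, and the boxes below are rescaled and clipped back to
# the original 600x912 resolution.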
55 | 
56 | output_boxes.scale(scale_x, scale_y)
57 | output_boxes.clip(results.image_size)
58 | 
59 | results = results[output_boxes.nonempty()]
60 | 
61 | if results.has("pred_masks"):
62 | if isinstance(results.pred_masks, ROIMasks):
63 | roi_masks = results.pred_masks
64 | else:
65 | # pred_masks is a tensor of shape (N, 1, M, M)
66 | roi_masks = ROIMasks(results.pred_masks[:, 0, :, :])
67 | results.pred_masks = roi_masks.to_bitmasks(
68 | results.pred_boxes, output_height, output_width, mask_threshold
69 | ).tensor # TODO return ROIMasks/BitMask object in the future
70 | 
71 | if results.has("pred_keypoints"):
72 | results.pred_keypoints[:, :, 0] *= scale_x
73 | results.pred_keypoints[:, :, 1] *= scale_y
74 | 
75 | return results
76 | 
77 | 
78 | def sem_seg_postprocess(result, img_size, output_height, output_width):
79 | """
80 | Return semantic segmentation predictions in the original resolution.
81 | 
82 | The input images are often resized when entering a semantic segmentor. Moreover, in some
83 | cases, they are also padded inside the segmentor to be divisible by the maximum network stride.
84 | As a result, we often need the predictions of the segmentor in a different
85 | resolution from its inputs.
86 | 
87 | Args:
88 | result (Tensor): semantic segmentation prediction logits. A tensor of shape (C, H, W),
89 | where C is the number of classes, and H, W are the height and width of the prediction.
90 | img_size (tuple): image size that the segmentor takes as input.
91 | output_height, output_width: the desired output resolution.
92 | 
93 | Returns:
94 | semantic segmentation prediction (Tensor): A tensor of the shape
95 | (C, output_height, output_width) that contains per-pixel soft predictions.
96 | """
97 | result = result[:, : img_size[0], : img_size[1]].expand(1, -1, -1, -1)
98 | result = F.interpolate(
99 | result, size=(output_height, output_width), mode="bilinear", align_corners=False
100 | )[0]
101 | return result
102 | 
-------------------------------------------------------------------------------- /detectron2/modeling/proposal_generator/__init__.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from .build import PROPOSAL_GENERATOR_REGISTRY, build_proposal_generator
3 | from .rpn import RPN_HEAD_REGISTRY, build_rpn_head, RPN, StandardRPNHead
4 | 
5 | __all__ = list(globals().keys())
6 | 
-------------------------------------------------------------------------------- /detectron2/modeling/proposal_generator/build.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from detectron2.utils.registry import Registry
3 | 
4 | PROPOSAL_GENERATOR_REGISTRY = Registry("PROPOSAL_GENERATOR")
5 | PROPOSAL_GENERATOR_REGISTRY.__doc__ = """
6 | Registry for proposal generators, which produce object proposals from feature maps.
7 | 
8 | The registered object will be called with `obj(cfg, input_shape)`.
9 | The call should return a `nn.Module` object.
10 | """
11 | 
12 | from . import rpn, rrpn # noqa F401 isort:skip
13 | 
14 | 
15 | def build_proposal_generator(cfg, input_shape):
16 | """
17 | Build a proposal generator from `cfg.MODEL.PROPOSAL_GENERATOR.NAME`.
18 | The name can be "PrecomputedProposals" to use no proposal generator.
19 | """ 20 | name = cfg.MODEL.PROPOSAL_GENERATOR.NAME 21 | if name == "PrecomputedProposals": 22 | return None 23 | 24 | return PROPOSAL_GENERATOR_REGISTRY.get(name)(cfg, input_shape) 25 | -------------------------------------------------------------------------------- /detectron2/modeling/roi_heads/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .box_head import ROI_BOX_HEAD_REGISTRY, build_box_head, FastRCNNConvFCHead 3 | from .keypoint_head import ( 4 | ROI_KEYPOINT_HEAD_REGISTRY, 5 | build_keypoint_head, 6 | BaseKeypointRCNNHead, 7 | KRCNNConvDeconvUpsampleHead, 8 | ) 9 | from .mask_head import ( 10 | ROI_MASK_HEAD_REGISTRY, 11 | build_mask_head, 12 | BaseMaskRCNNHead, 13 | MaskRCNNConvUpsampleHead, 14 | ) 15 | from .roi_heads import ( 16 | ROI_HEADS_REGISTRY, 17 | ROIHeads, 18 | Res5ROIHeads, 19 | StandardROIHeads, 20 | build_roi_heads, 21 | select_foreground_proposals, 22 | ) 23 | from .cascade_rcnn import CascadeROIHeads 24 | from .rotated_fast_rcnn import RROIHeads 25 | from .fast_rcnn import FastRCNNOutputLayers 26 | 27 | from . import cascade_rcnn # isort:skip 28 | 29 | __all__ = list(globals().keys()) 30 | -------------------------------------------------------------------------------- /detectron2/modeling/roi_heads/box_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import numpy as np 3 | from typing import List 4 | import fvcore.nn.weight_init as weight_init 5 | import torch 6 | from torch import nn 7 | 8 | from detectron2.config import configurable 9 | from detectron2.layers import Conv2d, ShapeSpec, get_norm 10 | from detectron2.utils.registry import Registry 11 | 12 | __all__ = ["FastRCNNConvFCHead", "build_box_head", "ROI_BOX_HEAD_REGISTRY"] 13 | 14 | ROI_BOX_HEAD_REGISTRY = Registry("ROI_BOX_HEAD") 15 | ROI_BOX_HEAD_REGISTRY.__doc__ = """ 16 | Registry for box heads, which make box predictions from per-region features. 17 | 18 | The registered object will be called with `obj(cfg, input_shape)`. 19 | """ 20 | 21 | 22 | # To get torchscript support, we make the head a subclass of `nn.Sequential`. 23 | # Therefore, to add new layers in this head class, please make sure they are 24 | # added in the order they will be used in forward(). 25 | @ROI_BOX_HEAD_REGISTRY.register() 26 | class FastRCNNConvFCHead(nn.Sequential): 27 | """ 28 | A head with several 3x3 conv layers (each followed by norm & relu) and then 29 | several fc layers (each followed by relu). 30 | """ 31 | 32 | @configurable 33 | def __init__( 34 | self, input_shape: ShapeSpec, *, conv_dims: List[int], fc_dims: List[int], conv_norm="" 35 | ): 36 | """ 37 | NOTE: this interface is experimental. 38 | 39 | Args: 40 | input_shape (ShapeSpec): shape of the input feature. 41 | conv_dims (list[int]): the output dimensions of the conv layers 42 | fc_dims (list[int]): the output dimensions of the fc layers 43 | conv_norm (str or callable): normalization for the conv layers. 44 | See :func:`detectron2.layers.get_norm` for supported types. 
45 | """ 46 | super().__init__() 47 | assert len(conv_dims) + len(fc_dims) > 0 48 | 49 | self._output_size = (input_shape.channels, input_shape.height, input_shape.width) 50 | 51 | self.conv_norm_relus = [] 52 | for k, conv_dim in enumerate(conv_dims): 53 | conv = Conv2d( 54 | self._output_size[0], 55 | conv_dim, 56 | kernel_size=3, 57 | padding=1, 58 | bias=not conv_norm, 59 | norm=get_norm(conv_norm, conv_dim), 60 | activation=nn.ReLU(), 61 | ) 62 | self.add_module("conv{}".format(k + 1), conv) 63 | self.conv_norm_relus.append(conv) 64 | self._output_size = (conv_dim, self._output_size[1], self._output_size[2]) 65 | 66 | self.fcs = [] 67 | for k, fc_dim in enumerate(fc_dims): 68 | if k == 0: 69 | self.add_module("flatten", nn.Flatten()) 70 | fc = nn.Linear(int(np.prod(self._output_size)), fc_dim) 71 | self.add_module("fc{}".format(k + 1), fc) 72 | self.add_module("fc_relu{}".format(k + 1), nn.ReLU()) 73 | self.fcs.append(fc) 74 | self._output_size = fc_dim 75 | 76 | for layer in self.conv_norm_relus: 77 | weight_init.c2_msra_fill(layer) 78 | for layer in self.fcs: 79 | weight_init.c2_xavier_fill(layer) 80 | 81 | @classmethod 82 | def from_config(cls, cfg, input_shape): 83 | num_conv = cfg.MODEL.ROI_BOX_HEAD.NUM_CONV 84 | conv_dim = cfg.MODEL.ROI_BOX_HEAD.CONV_DIM 85 | num_fc = cfg.MODEL.ROI_BOX_HEAD.NUM_FC 86 | fc_dim = cfg.MODEL.ROI_BOX_HEAD.FC_DIM 87 | return { 88 | "input_shape": input_shape, 89 | "conv_dims": [conv_dim] * num_conv, 90 | "fc_dims": [fc_dim] * num_fc, 91 | "conv_norm": cfg.MODEL.ROI_BOX_HEAD.NORM, 92 | } 93 | 94 | def forward(self, x): 95 | for layer in self: 96 | x = layer(x) 97 | return x 98 | 99 | @property 100 | @torch.jit.unused 101 | def output_shape(self): 102 | """ 103 | Returns: 104 | ShapeSpec: the output feature shape 105 | """ 106 | o = self._output_size 107 | if isinstance(o, int): 108 | return ShapeSpec(channels=o) 109 | else: 110 | return ShapeSpec(channels=o[0], height=o[1], width=o[2]) 111 | 112 | 113 | def build_box_head(cfg, input_shape): 114 | """ 115 | Build a box head defined by `cfg.MODEL.ROI_BOX_HEAD.NAME`. 116 | """ 117 | name = cfg.MODEL.ROI_BOX_HEAD.NAME 118 | return ROI_BOX_HEAD_REGISTRY.get(name)(cfg, input_shape) 119 | -------------------------------------------------------------------------------- /detectron2/modeling/sampling.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import torch 3 | 4 | from detectron2.layers import nonzero_tuple 5 | 6 | __all__ = ["subsample_labels"] 7 | 8 | 9 | def subsample_labels( 10 | labels: torch.Tensor, num_samples: int, positive_fraction: float, bg_label: int 11 | ): 12 | """ 13 | Return `num_samples` (or fewer, if not enough found) 14 | random samples from `labels` which is a mixture of positives & negatives. 15 | It will try to return as many positives as possible without 16 | exceeding `positive_fraction * num_samples`, and then try to 17 | fill the remaining slots with negatives. 18 | 19 | Args: 20 | labels (Tensor): (N, ) label vector with values: 21 | * -1: ignore 22 | * bg_label: background ("negative") class 23 | * otherwise: one or more foreground ("positive") classes 24 | num_samples (int): The total number of labels with value >= 0 to return. 25 | Values that are not sampled will be filled with -1 (ignore). 26 | positive_fraction (float): The number of subsampled labels with values > 0 27 | is `min(num_positives, int(positive_fraction * num_samples))`. 
The number
28 | of negatives sampled is `min(num_negatives, num_samples - num_positives_sampled)`.
29 | In other words, if there are not enough positives, the sample is filled with
30 | negatives. If there are also not enough negatives, then as many elements are
31 | sampled as is possible.
32 | bg_label (int): label index of background ("negative") class.
33 | 
34 | Returns:
35 | pos_idx, neg_idx (Tensor):
36 | 1D vector of indices. The total length of both is `num_samples` or fewer.
37 | """
38 | positive = nonzero_tuple((labels != -1) & (labels != bg_label))[0]
39 | negative = nonzero_tuple(labels == bg_label)[0]
40 | 
41 | num_pos = int(num_samples * positive_fraction)
42 | # protect against not enough positive examples
43 | num_pos = min(positive.numel(), num_pos)
44 | num_neg = num_samples - num_pos
45 | # protect against not enough negative examples
46 | num_neg = min(negative.numel(), num_neg)
47 | 
48 | # randomly select positive and negative examples
49 | perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos]
50 | perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg]
51 | 
52 | pos_idx = positive[perm1]
53 | neg_idx = negative[perm2]
54 | return pos_idx, neg_idx
55 | 
-------------------------------------------------------------------------------- /detectron2/projects/README.md: --------------------------------------------------------------------------------
1 | 
2 | Projects live in the [`projects` directory](../../projects) under the root of this repository, but not here.
3 | 
-------------------------------------------------------------------------------- /detectron2/projects/__init__.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import importlib
3 | from pathlib import Path
4 | 
5 | _PROJECTS = {
6 | "point_rend": "PointRend",
7 | "deeplab": "DeepLab",
8 | "panoptic_deeplab": "Panoptic-DeepLab",
9 | "ifc": "IFC",
10 | "ddetrs": "DDETRS",
11 | "idol": "IDOL",
12 | }
13 | _PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent / "projects"
14 | 
15 | if _PROJECT_ROOT.is_dir():
16 | # This is true only for in-place installation (pip install -e, setup.py develop),
17 | # where setup(package_dir=) does not work: https://github.com/pypa/setuptools/issues/230
18 | 
19 | class _D2ProjectsFinder(importlib.abc.MetaPathFinder):
20 | def find_spec(self, name, path, target=None):
21 | if not name.startswith("detectron2.projects."):
22 | return
23 | project_name = name.split(".")[-1]
24 | project_dir = _PROJECTS.get(project_name)
25 | if not project_dir:
26 | return
27 | target_file = _PROJECT_ROOT / f"{project_dir}/{project_name}/__init__.py"
28 | if not target_file.is_file():
29 | return
30 | return importlib.util.spec_from_file_location(name, target_file)
31 | 
32 | import sys
33 | 
34 | sys.meta_path.append(_D2ProjectsFinder())
35 | 
-------------------------------------------------------------------------------- /detectron2/solver/__init__.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from .build import build_lr_scheduler, build_optimizer, get_default_optimizer_params 3 | from .lr_scheduler import WarmupCosineLR, WarmupMultiStepLR, LRMultiplier, WarmupParamScheduler 4 | 5 | __all__ = [k for k in globals().keys() if not k.startswith("_")] 6 | -------------------------------------------------------------------------------- /detectron2/structures/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .boxes import Boxes, BoxMode, pairwise_iou, pairwise_ioa, pairwise_point_box_distance 3 | from .image_list import ImageList 4 | 5 | from .instances import Instances 6 | from .keypoints import Keypoints, heatmaps_to_keypoints 7 | from .masks import BitMasks, PolygonMasks, polygons_to_bitmask, ROIMasks 8 | from .rotated_boxes import RotatedBoxes 9 | from .rotated_boxes import pairwise_iou as pairwise_iou_rotated 10 | 11 | __all__ = [k for k in globals().keys() if not k.startswith("_")] 12 | 13 | 14 | from detectron2.utils.env import fixup_module_metadata 15 | 16 | fixup_module_metadata(__name__, globals(), __all__) 17 | del fixup_module_metadata 18 | -------------------------------------------------------------------------------- /detectron2/structures/image_list.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from __future__ import division 3 | from typing import Any, List, Tuple 4 | import torch 5 | from torch import device 6 | from torch.nn import functional as F 7 | 8 | from detectron2.layers.wrappers import shapes_to_tensor 9 | 10 | 11 | class ImageList(object): 12 | """ 13 | Structure that holds a list of images (of possibly 14 | varying sizes) as a single tensor. 15 | This works by padding the images to the same size. 16 | The original sizes of each image is stored in `image_sizes`. 17 | 18 | Attributes: 19 | image_sizes (list[tuple[int, int]]): each tuple is (h, w). 20 | During tracing, it becomes list[Tensor] instead. 21 | """ 22 | 23 | def __init__(self, tensor: torch.Tensor, image_sizes: List[Tuple[int, int]]): 24 | """ 25 | Arguments: 26 | tensor (Tensor): of shape (N, H, W) or (N, C_1, ..., C_K, H, W) where K >= 1 27 | image_sizes (list[tuple[int, int]]): Each tuple is (h, w). It can 28 | be smaller than (H, W) due to padding. 29 | """ 30 | self.tensor = tensor 31 | self.image_sizes = image_sizes 32 | 33 | def __len__(self) -> int: 34 | return len(self.image_sizes) 35 | 36 | def __getitem__(self, idx) -> torch.Tensor: 37 | """ 38 | Access the individual image in its original size. 39 | 40 | Args: 41 | idx: int or slice 42 | 43 | Returns: 44 | Tensor: an image of shape (H, W) or (C_1, ..., C_K, H, W) where K >= 1 45 | """ 46 | size = self.image_sizes[idx] 47 | return self.tensor[idx, ..., : size[0], : size[1]] 48 | 49 | @torch.jit.unused 50 | def to(self, *args: Any, **kwargs: Any) -> "ImageList": 51 | cast_tensor = self.tensor.to(*args, **kwargs) 52 | return ImageList(cast_tensor, self.image_sizes) 53 | 54 | @property 55 | def device(self) -> device: 56 | return self.tensor.device 57 | 58 | @staticmethod 59 | def from_tensors( 60 | tensors: List[torch.Tensor], size_divisibility: int = 0, pad_value: float = 0.0 61 | ) -> "ImageList": 62 | """ 63 | Args: 64 | tensors: a tuple or list of `torch.Tensor`, each of shape (Hi, Wi) or 65 | (C_1, ..., C_K, Hi, Wi) where K >= 1. The Tensors will be padded 66 | to the same shape with `pad_value`. 
67 | size_divisibility (int): If `size_divisibility > 0`, add padding to ensure 68 | the common height and width is divisible by `size_divisibility`. 69 | This depends on the model and many models need a divisibility of 32. 70 | pad_value (float): value to pad 71 | 72 | Returns: 73 | an `ImageList`. 74 | """ 75 | assert len(tensors) > 0 76 | assert isinstance(tensors, (tuple, list)) 77 | for t in tensors: 78 | assert isinstance(t, torch.Tensor), type(t) 79 | assert t.shape[:-2] == tensors[0].shape[:-2], t.shape 80 | 81 | image_sizes = [(im.shape[-2], im.shape[-1]) for im in tensors] 82 | image_sizes_tensor = [shapes_to_tensor(x) for x in image_sizes] 83 | max_size = torch.stack(image_sizes_tensor).max(0).values 84 | 85 | if size_divisibility > 1: 86 | stride = size_divisibility 87 | # the last two dims are H,W, both subject to divisibility requirement 88 | max_size = (max_size + (stride - 1)).div(stride, rounding_mode="floor") * stride 89 | 90 | # handle weirdness of scripting and tracing ... 91 | if torch.jit.is_scripting(): 92 | max_size: List[int] = max_size.to(dtype=torch.long).tolist() 93 | else: 94 | if torch.jit.is_tracing(): 95 | image_sizes = image_sizes_tensor 96 | 97 | if len(tensors) == 1: 98 | # This seems slightly (2%) faster. 99 | # TODO: check whether it's faster for multiple images as well 100 | image_size = image_sizes[0] 101 | padding_size = [0, max_size[-1] - image_size[1], 0, max_size[-2] - image_size[0]] 102 | batched_imgs = F.pad(tensors[0], padding_size, value=pad_value).unsqueeze_(0) 103 | else: 104 | # max_size can be a tensor in tracing mode, therefore convert to list 105 | batch_shape = [len(tensors)] + list(tensors[0].shape[:-2]) + list(max_size) 106 | batched_imgs = tensors[0].new_full(batch_shape, pad_value) 107 | for img, pad_img in zip(tensors, batched_imgs): 108 | pad_img[..., : img.shape[-2], : img.shape[-1]].copy_(img) 109 | 110 | return ImageList(batched_imgs.contiguous(), image_sizes) 111 | -------------------------------------------------------------------------------- /detectron2/tracking/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .base_tracker import ( # noqa 3 | BaseTracker, 4 | build_tracker_head, 5 | TRACKER_HEADS_REGISTRY, 6 | ) 7 | from .bbox_iou_tracker import BBoxIOUTracker # noqa 8 | from .hungarian_tracker import BaseHungarianTracker # noqa 9 | from .iou_weighted_hungarian_bbox_iou_tracker import ( # noqa 10 | IOUWeightedHungarianBBoxIOUTracker, 11 | ) 12 | from .utils import create_prediction_pairs # noqa 13 | from .vanilla_hungarian_bbox_iou_tracker import VanillaHungarianBBoxIOUTracker # noqa 14 | 15 | __all__ = [k for k in globals().keys() if not k.startswith("_")] 16 | -------------------------------------------------------------------------------- /detectron2/tracking/base_tracker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2004-present Facebook. All Rights Reserved. 3 | from ..structures import Instances 4 | from detectron2.utils.registry import Registry 5 | from ..config.config import CfgNode as CfgNode_ 6 | from detectron2.config import configurable 7 | 8 | 9 | TRACKER_HEADS_REGISTRY = Registry("TRACKER_HEADS") 10 | TRACKER_HEADS_REGISTRY.__doc__ = """ 11 | Registry for tracking classes. 
12 | """ 13 | 14 | 15 | class BaseTracker(object): 16 | """ 17 | A parent class for all trackers 18 | """ 19 | 20 | @configurable 21 | def __init__(self, **kwargs): 22 | self._prev_instances = None # (D2)instances for previous frame 23 | self._matched_idx = set() # indices in prev_instances found matching 24 | self._matched_ID = set() # idendities in prev_instances found matching 25 | self._untracked_prev_idx = set() # indices in prev_instances not found matching 26 | self._id_count = 0 # used to assign new id 27 | 28 | @classmethod 29 | def from_config(cls, cfg: CfgNode_): 30 | raise NotImplementedError("Calling BaseTracker::from_config") 31 | 32 | def update(self, predictions: Instances) -> Instances: 33 | """ 34 | Args: 35 | predictions: D2 Instances for predictions of the current frame 36 | Return: 37 | D2 Instances for predictions of the current frame with ID assigned 38 | 39 | _prev_instances and instances will have the following fields: 40 | .pred_boxes (shape=[N, 4]) 41 | .scores (shape=[N,]) 42 | .pred_classes (shape=[N,]) 43 | .pred_keypoints (shape=[N, M, 3], Optional) 44 | .pred_masks (shape=List[2D_MASK], Optional) 2D_MASK: shape=[H, W] 45 | .ID (shape=[N,]) 46 | 47 | N: # of detected bboxes 48 | H and W: height and width of 2D mask 49 | """ 50 | raise NotImplementedError("Calling BaseTracker::update") 51 | 52 | 53 | def build_tracker_head(cfg: CfgNode_) -> BaseTracker: 54 | """ 55 | Build a tracker head from `cfg.TRACKER_HEADS.TRACKER_NAME`. 56 | 57 | Args: 58 | cfg: D2 CfgNode, config file with tracker information 59 | Return: 60 | tracker object 61 | """ 62 | name = cfg.TRACKER_HEADS.TRACKER_NAME 63 | tracker_class = TRACKER_HEADS_REGISTRY.get(name) 64 | return tracker_class(cfg) 65 | -------------------------------------------------------------------------------- /detectron2/tracking/iou_weighted_hungarian_bbox_iou_tracker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2004-present Facebook. All Rights Reserved. 
3 | 
4 | from typing import List
5 | 
6 | import numpy as np
7 | 
8 | from .base_tracker import TRACKER_HEADS_REGISTRY
9 | from .vanilla_hungarian_bbox_iou_tracker import VanillaHungarianBBoxIOUTracker
10 | from detectron2.config import configurable, CfgNode as CfgNode_
11 | 
12 | 
13 | @TRACKER_HEADS_REGISTRY.register()
14 | class IOUWeightedHungarianBBoxIOUTracker(VanillaHungarianBBoxIOUTracker):
15 | """
16 | A tracker that uses IoU as the weight in the Hungarian algorithm, also known
17 | as the Munkres or Kuhn-Munkres algorithm
18 | """
19 | 
20 | @configurable
21 | def __init__(
22 | self,
23 | *,
24 | video_height: int,
25 | video_width: int,
26 | max_num_instances: int = 200,
27 | max_lost_frame_count: int = 0,
28 | min_box_rel_dim: float = 0.02,
29 | min_instance_period: int = 1,
30 | track_iou_threshold: float = 0.5,
31 | **kwargs
32 | ):
33 | """
34 | Args:
35 | video_height: height of the video frame
36 | video_width: width of the video frame
37 | max_num_instances: maximum number of ids allowed to be tracked
38 | max_lost_frame_count: maximum number of frames an id can lose tracking;
39 | beyond this number, an id is considered lost
40 | forever
41 | min_box_rel_dim: a percentage; a bbox with a dimension smaller than this is
42 | removed from tracking
43 | min_instance_period: an instance is only shown after this number of periods
44 | since it first appears in the video
45 | track_iou_threshold: IoU threshold; a bbox pair below this value is removed
46 | from tracking
47 | """
48 | super().__init__(
49 | video_height=video_height,
50 | video_width=video_width,
51 | max_num_instances=max_num_instances,
52 | max_lost_frame_count=max_lost_frame_count,
53 | min_box_rel_dim=min_box_rel_dim,
54 | min_instance_period=min_instance_period,
55 | track_iou_threshold=track_iou_threshold
56 | )
57 | 
58 | @classmethod
59 | def from_config(cls, cfg: CfgNode_):
60 | """
61 | Old-style initialization using CfgNode
62 | 
63 | Args:
64 | cfg: D2 CfgNode, config file
65 | Return:
66 | dictionary storing arguments for the __init__ method
67 | """
68 | assert "VIDEO_HEIGHT" in cfg.TRACKER_HEADS
69 | assert "VIDEO_WIDTH" in cfg.TRACKER_HEADS
70 | video_height = cfg.TRACKER_HEADS.get("VIDEO_HEIGHT")
71 | video_width = cfg.TRACKER_HEADS.get("VIDEO_WIDTH")
72 | max_num_instances = cfg.TRACKER_HEADS.get("MAX_NUM_INSTANCES", 200)
73 | max_lost_frame_count = cfg.TRACKER_HEADS.get("MAX_LOST_FRAME_COUNT", 0)
74 | min_box_rel_dim = cfg.TRACKER_HEADS.get("MIN_BOX_REL_DIM", 0.02)
75 | min_instance_period = cfg.TRACKER_HEADS.get("MIN_INSTANCE_PERIOD", 1)
76 | track_iou_threshold = cfg.TRACKER_HEADS.get("TRACK_IOU_THRESHOLD", 0.5)
77 | return {
78 | "_target_": "detectron2.tracking.iou_weighted_hungarian_bbox_iou_tracker.IOUWeightedHungarianBBoxIOUTracker", # noqa
79 | "video_height": video_height,
80 | "video_width": video_width,
81 | "max_num_instances": max_num_instances,
82 | "max_lost_frame_count": max_lost_frame_count,
83 | "min_box_rel_dim": min_box_rel_dim,
84 | "min_instance_period": min_instance_period,
85 | "track_iou_threshold": track_iou_threshold
86 | }
87 | 
88 | def assign_cost_matrix_values(self, cost_matrix: np.ndarray, bbox_pairs: List) -> np.ndarray:
89 | """
90 | Based on the IoU of each bbox pair, assign the associated value in the cost matrix
91 | 
92 | Args:
93 | cost_matrix: np.ndarray, initialized 2D array with target dimensions
94 | bbox_pairs: list of bbox pairs; each pair stores its IoU value
95 | Return:
96 | np.ndarray, cost_matrix with assigned values
97 | """
98 | for pair in bbox_pairs:
99 | # assign (-1 * IoU)
100 |             cost_matrix[pair["idx"]][pair["prev_idx"]] = -1 * pair["IoU"]
101 |         return cost_matrix
102 | 
--------------------------------------------------------------------------------
/detectron2/tracking/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from detectron2.structures import Instances
3 | import numpy as np
4 | from typing import List
5 | 
6 | 
7 | def create_prediction_pairs(
8 |     instances: Instances,
9 |     prev_instances: Instances,
10 |     iou_all: np.ndarray,
11 |     threshold: float = 0.5,
12 | ) -> List:
13 |     """
14 |     Args:
15 |         instances: predictions from the current frame
16 |         prev_instances: predictions from the previous frame
17 |         iou_all: 2D numpy array containing the IoU for each bbox pair
18 |         threshold: bbox pairs with an IoU below this threshold are not considered valid
19 |     Return:
20 |         List of bbox pairs
21 |     """
22 |     bbox_pairs = []
23 |     for i in range(len(instances)):
24 |         for j in range(len(prev_instances)):
25 |             if iou_all[i, j] < threshold:
26 |                 continue
27 |             bbox_pairs.append(
28 |                 {
29 |                     "idx": i,
30 |                     "prev_idx": j,
31 |                     "prev_id": prev_instances.ID[j],
32 |                     "IoU": iou_all[i, j],
33 |                     "prev_period": prev_instances.ID_period[j],
34 |                 }
35 |             )
36 |     return bbox_pairs
37 | 
38 | 
39 | LARGE_COST_VALUE = 100000
40 | 
--------------------------------------------------------------------------------
/detectron2/utils/README.md:
--------------------------------------------------------------------------------
1 | # Utility functions
2 | 
3 | This folder contains utility functions that are not used in the
4 | core library, but are useful for building models or training
5 | code using the config system.
6 | 
--------------------------------------------------------------------------------
/detectron2/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | 
--------------------------------------------------------------------------------
/detectron2/utils/colormap.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | 
3 | """
4 | An awesome colormap for really neat visualizations.
5 | Copied from Detectron, and removed gray colors.
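Illustrative usage (a sketch added for clarity, not part of the original docstring):

    from detectron2.utils.colormap import colormap, random_color
    colors = colormap(rgb=True, maximum=1)     # (N, 3) float32 array of colors in [0, 1]
    one = random_color(rgb=True, maximum=255)  # a single color as a vector of 3 numbers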
6 | """ 7 | 8 | import numpy as np 9 | import random 10 | 11 | __all__ = ["colormap", "random_color", "random_colors"] 12 | 13 | # fmt: off 14 | # RGB: 15 | _COLORS = np.array( 16 | [ 17 | 0.000, 0.447, 0.741, 18 | 0.850, 0.325, 0.098, 19 | 0.929, 0.694, 0.125, 20 | 0.494, 0.184, 0.556, 21 | 0.466, 0.674, 0.188, 22 | 0.301, 0.745, 0.933, 23 | 0.635, 0.078, 0.184, 24 | 0.300, 0.300, 0.300, 25 | 0.600, 0.600, 0.600, 26 | 1.000, 0.000, 0.000, 27 | 1.000, 0.500, 0.000, 28 | 0.749, 0.749, 0.000, 29 | 0.000, 1.000, 0.000, 30 | 0.000, 0.000, 1.000, 31 | 0.667, 0.000, 1.000, 32 | 0.333, 0.333, 0.000, 33 | 0.333, 0.667, 0.000, 34 | 0.333, 1.000, 0.000, 35 | 0.667, 0.333, 0.000, 36 | 0.667, 0.667, 0.000, 37 | 0.667, 1.000, 0.000, 38 | 1.000, 0.333, 0.000, 39 | 1.000, 0.667, 0.000, 40 | 1.000, 1.000, 0.000, 41 | 0.000, 0.333, 0.500, 42 | 0.000, 0.667, 0.500, 43 | 0.000, 1.000, 0.500, 44 | 0.333, 0.000, 0.500, 45 | 0.333, 0.333, 0.500, 46 | 0.333, 0.667, 0.500, 47 | 0.333, 1.000, 0.500, 48 | 0.667, 0.000, 0.500, 49 | 0.667, 0.333, 0.500, 50 | 0.667, 0.667, 0.500, 51 | 0.667, 1.000, 0.500, 52 | 1.000, 0.000, 0.500, 53 | 1.000, 0.333, 0.500, 54 | 1.000, 0.667, 0.500, 55 | 1.000, 1.000, 0.500, 56 | 0.000, 0.333, 1.000, 57 | 0.000, 0.667, 1.000, 58 | 0.000, 1.000, 1.000, 59 | 0.333, 0.000, 1.000, 60 | 0.333, 0.333, 1.000, 61 | 0.333, 0.667, 1.000, 62 | 0.333, 1.000, 1.000, 63 | 0.667, 0.000, 1.000, 64 | 0.667, 0.333, 1.000, 65 | 0.667, 0.667, 1.000, 66 | 0.667, 1.000, 1.000, 67 | 1.000, 0.000, 1.000, 68 | 1.000, 0.333, 1.000, 69 | 1.000, 0.667, 1.000, 70 | 0.333, 0.000, 0.000, 71 | 0.500, 0.000, 0.000, 72 | 0.667, 0.000, 0.000, 73 | 0.833, 0.000, 0.000, 74 | 1.000, 0.000, 0.000, 75 | 0.000, 0.167, 0.000, 76 | 0.000, 0.333, 0.000, 77 | 0.000, 0.500, 0.000, 78 | 0.000, 0.667, 0.000, 79 | 0.000, 0.833, 0.000, 80 | 0.000, 1.000, 0.000, 81 | 0.000, 0.000, 0.167, 82 | 0.000, 0.000, 0.333, 83 | 0.000, 0.000, 0.500, 84 | 0.000, 0.000, 0.667, 85 | 0.000, 0.000, 0.833, 86 | 0.000, 0.000, 1.000, 87 | 0.000, 0.000, 0.000, 88 | 0.143, 0.143, 0.143, 89 | 0.857, 0.857, 0.857, 90 | 1.000, 1.000, 1.000 91 | ] 92 | ).astype(np.float32).reshape(-1, 3) 93 | # fmt: on 94 | 95 | 96 | def colormap(rgb=False, maximum=255): 97 | """ 98 | Args: 99 | rgb (bool): whether to return RGB colors or BGR colors. 100 | maximum (int): either 255 or 1 101 | 102 | Returns: 103 | ndarray: a float32 array of Nx3 colors, in range [0, 255] or [0, 1] 104 | """ 105 | assert maximum in [255, 1], maximum 106 | c = _COLORS * maximum 107 | if not rgb: 108 | c = c[:, ::-1] 109 | return c 110 | 111 | 112 | def random_color(rgb=False, maximum=255): 113 | """ 114 | Args: 115 | rgb (bool): whether to return RGB colors or BGR colors. 116 | maximum (int): either 255 or 1 117 | 118 | Returns: 119 | ndarray: a vector of 3 numbers 120 | """ 121 | idx = np.random.randint(0, len(_COLORS)) 122 | ret = _COLORS[idx] * maximum 123 | if not rgb: 124 | ret = ret[::-1] 125 | return ret 126 | 127 | 128 | def random_colors(N, rgb=False, maximum=255): 129 | """ 130 | Args: 131 | N (int): number of unique colors needed 132 | rgb (bool): whether to return RGB colors or BGR colors. 
133 |         maximum (int): either 255 or 1
134 | 
135 |     Returns:
136 |         list[ndarray]: a list of N random colors
137 |     """
138 |     indices = random.sample(range(len(_COLORS)), N)
139 |     ret = [_COLORS[i] * maximum for i in indices]
140 |     if not rgb:
141 |         ret = [x[::-1] for x in ret]
142 |     return ret
143 | 
144 | 
145 | if __name__ == "__main__":
146 |     import cv2
147 | 
148 |     size = 100
149 |     H, W = 10, 10
150 |     canvas = np.random.rand(H * size, W * size, 3).astype("float32")
151 |     for h in range(H):
152 |         for w in range(W):
153 |             idx = h * W + w
154 |             if idx >= len(_COLORS):
155 |                 break
156 |             canvas[h * size : (h + 1) * size, w * size : (w + 1) * size] = _COLORS[idx]
157 |     cv2.imshow("a", canvas)
158 |     cv2.waitKey(0)
159 | 
--------------------------------------------------------------------------------
/detectron2/utils/develop.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | """ Utilities for developers only.
3 | These are not visible to users (not automatically imported) and should not
4 | appear in docs."""
5 | # adapted from https://github.com/tensorpack/tensorpack/blob/master/tensorpack/utils/develop.py
6 | 
7 | 
8 | def create_dummy_class(klass, dependency, message=""):
9 |     """
10 |     When a dependency of a class is not available, create a dummy class which throws ImportError
11 |     when used.
12 | 
13 |     Args:
14 |         klass (str): name of the class.
15 |         dependency (str): name of the dependency.
16 |         message: extra message to print
17 |     Returns:
18 |         class: a class object
19 |     """
20 |     err = "Cannot import '{}', therefore '{}' is not available.".format(dependency, klass)
21 |     if message:
22 |         err = err + " " + message
23 | 
24 |     class _DummyMetaClass(type):
25 |         # throw error on class attribute access
26 |         def __getattr__(_, __):
27 |             raise ImportError(err)
28 | 
29 |     class _Dummy(object, metaclass=_DummyMetaClass):
30 |         # throw error on constructor
31 |         def __init__(self, *args, **kwargs):
32 |             raise ImportError(err)
33 | 
34 |     return _Dummy
35 | 
36 | 
37 | def create_dummy_func(func, dependency, message=""):
38 |     """
39 |     When a dependency of a function is not available, create a dummy function which throws
40 |     ImportError when used.
41 | 
42 |     Args:
43 |         func (str): name of the function.
44 |         dependency (str or list[str]): name(s) of the dependency.
45 |         message: extra message to print
46 |     Returns:
47 |         function: a function object
48 |     """
49 |     err = "Cannot import '{}', therefore '{}' is not available.".format(dependency, func)
50 |     if message:
51 |         err = err + " " + message
52 | 
53 |     if isinstance(dependency, (list, tuple)):
54 |         dependency = ",".join(dependency)
55 | 
56 |     def _dummy(*args, **kwargs):
57 |         raise ImportError(err)
58 | 
59 |     return _dummy
60 | 
--------------------------------------------------------------------------------
/detectron2/utils/file_io.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | from iopath.common.file_io import HTTPURLHandler, OneDrivePathHandler, PathHandler
3 | from iopath.common.file_io import PathManager as PathManagerBase
4 | 
5 | __all__ = ["PathManager", "PathHandler"]
6 | 
7 | 
8 | PathManager = PathManagerBase()
9 | """
10 | This is a detectron2 project-specific PathManager.
11 | We try to stay away from the global PathManager in fvcore as it
12 | introduces potential conflicts among other libraries.
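Illustrative usage (a sketch added for clarity; the relative path below is hypothetical):

    from detectron2.utils.file_io import PathManager
    # "detectron2://<name>" is rewritten by the Detectron2Handler registered below to
    # "https://dl.fbaipublicfiles.com/detectron2/<name>" before the file is fetched.
    with PathManager.open("detectron2://some/checkpoint.pkl", "rb") as f:
        data = f.read()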
13 | """ 14 | 15 | 16 | class Detectron2Handler(PathHandler): 17 | """ 18 | Resolve anything that's hosted under detectron2's namespace. 19 | """ 20 | 21 | PREFIX = "detectron2://" 22 | S3_DETECTRON2_PREFIX = "https://dl.fbaipublicfiles.com/detectron2/" 23 | 24 | def _get_supported_prefixes(self): 25 | return [self.PREFIX] 26 | 27 | def _get_local_path(self, path, **kwargs): 28 | name = path[len(self.PREFIX) :] 29 | return PathManager.get_local_path(self.S3_DETECTRON2_PREFIX + name, **kwargs) 30 | 31 | def _open(self, path, mode="r", **kwargs): 32 | return PathManager.open(self._get_local_path(path), mode, **kwargs) 33 | 34 | 35 | PathManager.register_handler(HTTPURLHandler()) 36 | PathManager.register_handler(OneDrivePathHandler()) 37 | PathManager.register_handler(Detectron2Handler()) 38 | -------------------------------------------------------------------------------- /detectron2/utils/memory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | import logging 4 | from contextlib import contextmanager 5 | from functools import wraps 6 | import torch 7 | 8 | __all__ = ["retry_if_cuda_oom"] 9 | 10 | 11 | @contextmanager 12 | def _ignore_torch_cuda_oom(): 13 | """ 14 | A context which ignores CUDA OOM exception from pytorch. 15 | """ 16 | try: 17 | yield 18 | except RuntimeError as e: 19 | # NOTE: the string may change? 20 | if "CUDA out of memory. " in str(e): 21 | pass 22 | else: 23 | raise 24 | 25 | 26 | def retry_if_cuda_oom(func): 27 | """ 28 | Makes a function retry itself after encountering 29 | pytorch's CUDA OOM error. 30 | It will first retry after calling `torch.cuda.empty_cache()`. 31 | 32 | If that still fails, it will then retry by trying to convert inputs to CPUs. 33 | In this case, it expects the function to dispatch to CPU implementation. 34 | The return values may become CPU tensors as well and it's user's 35 | responsibility to convert it back to CUDA tensor if needed. 36 | 37 | Args: 38 | func: a stateless callable that takes tensor-like objects as arguments 39 | 40 | Returns: 41 | a callable which retries `func` if OOM is encountered. 42 | 43 | Examples: 44 | :: 45 | output = retry_if_cuda_oom(some_torch_function)(input1, input2) 46 | # output may be on CPU even if inputs are on GPU 47 | 48 | Note: 49 | 1. When converting inputs to CPU, it will only look at each argument and check 50 | if it has `.device` and `.to` for conversion. Nested structures of tensors 51 | are not supported. 52 | 53 | 2. Since the function might be called more than once, it has to be 54 | stateless. 55 | """ 56 | 57 | def maybe_to_cpu(x): 58 | try: 59 | like_gpu_tensor = x.device.type == "cuda" and hasattr(x, "to") 60 | except AttributeError: 61 | like_gpu_tensor = False 62 | if like_gpu_tensor: 63 | return x.to(device="cpu") 64 | else: 65 | return x 66 | 67 | @wraps(func) 68 | def wrapped(*args, **kwargs): 69 | with _ignore_torch_cuda_oom(): 70 | return func(*args, **kwargs) 71 | 72 | # Clear cache and retry 73 | torch.cuda.empty_cache() 74 | with _ignore_torch_cuda_oom(): 75 | return func(*args, **kwargs) 76 | 77 | # Try on CPU. This slows down the code significantly, therefore print a notice. 
78 |         logger = logging.getLogger(__name__)
79 |         logger.info("Attempting to copy inputs of {} to CPU due to CUDA OOM".format(str(func)))
80 |         new_args = (maybe_to_cpu(x) for x in args)
81 |         new_kwargs = {k: maybe_to_cpu(v) for k, v in kwargs.items()}
82 |         return func(*new_args, **new_kwargs)
83 | 
84 |     return wrapped
85 | 
--------------------------------------------------------------------------------
/detectron2/utils/registry.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | 
3 | from typing import Any
4 | import pydoc
5 | from fvcore.common.registry import Registry  # for backward compatibility.
6 | 
7 | """
8 | ``Registry`` and `locate` provide ways to map a string (typically found
9 | in config files) to callable objects.
10 | """
11 | 
12 | __all__ = ["Registry", "locate"]
13 | 
14 | 
15 | def _convert_target_to_string(t: Any) -> str:
16 |     """
17 |     Inverse of ``locate()``.
18 | 
19 |     Args:
20 |         t: any object with ``__module__`` and ``__qualname__``
21 |     """
22 |     module, qualname = t.__module__, t.__qualname__
23 | 
24 |     # Compress the path to this object, e.g. ``module.submodule._impl.class``
25 |     # may become ``module.submodule.class``, if the latter also resolves to the same
26 |     # object. This simplifies the string, and also is less affected by moving the
27 |     # class implementation.
28 |     module_parts = module.split(".")
29 |     for k in range(1, len(module_parts)):
30 |         prefix = ".".join(module_parts[:k])
31 |         candidate = f"{prefix}.{qualname}"
32 |         try:
33 |             if locate(candidate) is t:
34 |                 return candidate
35 |         except ImportError:
36 |             pass
37 |     return f"{module}.{qualname}"
38 | 
39 | 
40 | def locate(name: str) -> Any:
41 |     """
42 |     Locate and return an object ``x`` using an input string ``{x.__module__}.{x.__qualname__}``,
43 |     such as "module.submodule.class_name".
44 | 
45 |     Raise Exception if it cannot be found.
46 |     """
47 |     obj = pydoc.locate(name)
48 | 
49 |     # Some cases (e.g. torch.optim.sgd.SGD) are not handled correctly
50 |     # by pydoc.locate. Try a private function from hydra.
51 |     if obj is None:
52 |         try:
53 |             # from hydra.utils import get_method - will print many errors
54 |             from hydra.utils import _locate
55 |         except ImportError as e:
56 |             raise ImportError(f"Cannot dynamically locate object {name}!") from e
57 |         else:
58 |             obj = _locate(name)  # it raises if it fails
59 | 
60 |     return obj
61 | 
--------------------------------------------------------------------------------
/detectron2/utils/serialize.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import cloudpickle
3 | 
4 | 
5 | class PicklableWrapper(object):
6 |     """
7 |     Wrap an object to make it more picklable. Note that it uses
8 |     heavyweight serialization libraries that are slower than pickle.
9 |     It's best to use it only on closures (which are usually not picklable).
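Illustrative usage (a sketch added for clarity, not part of the original docstring):

    import pickle
    offset = 3
    fn = PicklableWrapper(lambda x: x + offset)  # a closure that plain pickle cannot handle
    fn2 = pickle.loads(pickle.dumps(fn))         # round-trips through cloudpickle via __reduce__
    assert fn2(1) == 4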
10 | 
11 |     This is a simplified version of
12 |     https://github.com/joblib/joblib/blob/master/joblib/externals/loky/cloudpickle_wrapper.py
13 |     """
14 | 
15 |     def __init__(self, obj):
16 |         while isinstance(obj, PicklableWrapper):
17 |             # Wrapping an object twice is a no-op
18 |             obj = obj._obj
19 |         self._obj = obj
20 | 
21 |     def __reduce__(self):
22 |         s = cloudpickle.dumps(self._obj)
23 |         return cloudpickle.loads, (s,)
24 | 
25 |     def __call__(self, *args, **kwargs):
26 |         return self._obj(*args, **kwargs)
27 | 
28 |     def __getattr__(self, attr):
29 |         # Ensure that the wrapped object can be used seamlessly in place of the original object.
30 |         if attr not in ["_obj"]:
31 |             return getattr(self._obj, attr)
32 |         return getattr(self, attr)
33 | 
--------------------------------------------------------------------------------
/launch.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | import os
3 | import sys
4 | import socket
5 | import random
6 | import argparse
7 | import subprocess
8 | import time
9 | 
10 | def _find_free_port():
11 |     sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
12 |     # Binding to port 0 will cause the OS to find an available port for us
13 |     sock.bind(("", 0))
14 |     port = sock.getsockname()[1]
15 |     sock.close()
16 |     # NOTE: there is still a chance the port could be taken by other processes.
17 |     return port
18 | 
19 | 
20 | def _get_rand_port():
21 |     hour = time.time() // 3600
22 |     random.seed(int(hour))
23 |     return random.randrange(40000, 60000)
24 | 
25 | 
26 | def init_workdir():
27 |     ROOT = os.path.dirname(os.path.abspath(__file__))
28 |     os.chdir(ROOT)
29 |     sys.path.insert(0, ROOT)
30 | 
31 | 
32 | 
33 | if __name__ == '__main__':
34 |     parser = argparse.ArgumentParser(description='Launcher')
35 |     parser.add_argument('--launch', type=str, default='projects/DDETRS/train_net.py',
36 |                         help='Specify the launcher script.')
37 |     parser.add_argument('--dist', type=int, default=1,
38 |                         help='Whether to start via torch.distributed.launch.')
39 |     parser.add_argument('--np', type=int, default=8,
40 |                         help='number of (GPU) processes per node')
41 |     parser.add_argument('--nn', type=int, default=1,
42 |                         help='number of worker nodes (machines)')
43 |     parser.add_argument('--port', type=int, default=-1,
44 |                         help='master port for communication')
45 |     parser.add_argument('--worker_rank', type=int, default=0)
46 |     parser.add_argument('--master_address', type=str)
47 |     args, other_args = parser.parse_known_args()
48 | 
49 |     # change to the project directory
50 |     prj_dir = os.path.dirname(os.path.abspath(__file__))
51 |     os.chdir(prj_dir)
52 |     init_workdir()
53 | 
54 |     # Get training info
55 |     master_address = args.master_address
56 |     num_processes_per_worker = args.np
57 |     num_workers = args.nn
58 |     worker_rank = args.worker_rank
59 | 
60 |     # Get port
61 |     if args.port > 0:
62 |         master_port = args.port
63 |     elif num_workers == 1:
64 |         master_port = _find_free_port()
65 |     else:  # This reduces the chance of a port conflict, but availability is still not guaranteed.
66 | master_port = _get_rand_port() 67 | 68 | 69 | if args.dist >= 1: 70 | print(f'Start {args.launch} by torch.distributed.launch with port {master_port}!', flush=True) 71 | cmd = f'python3 {args.launch}\ 72 | --num-gpus={num_processes_per_worker}' 73 | if num_workers > 1: 74 | # multi-machine 75 | assert master_address is not None 76 | dist_url = "tcp://" + str(master_address) + ":" + str(master_port) 77 | cmd += f" --num-machines={num_workers}\ 78 | --machine-rank={worker_rank}\ 79 | --dist-url={dist_url}" 80 | else: 81 | print(f'Start {args.launch}!', flush=True) 82 | cmd = f'python3 {args.launch}' 83 | # $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3}' 84 | for argv in other_args: 85 | cmd += f' {argv}' 86 | print("==> Run command: " + cmd) 87 | exit_code = subprocess.call(cmd, shell=True) 88 | sys.exit(exit_code) -------------------------------------------------------------------------------- /projects/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FoundationVision/GenerateU/904e5337758d554e44ad50c2afa567c171166cd9/projects/.DS_Store -------------------------------------------------------------------------------- /projects/DDETRS/configs/vg_grit5m_swinL.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | USE_IOU_BRANCH: True 3 | LANG_GUIDE_DET: False 4 | LANG_AS_TGT: False 5 | DECOUPLE_TGT: False 6 | STILL_TGT_FOR_BOTH: True 7 | META_ARCHITECTURE: "DDETRSVLUni" 8 | BACKBONE: 9 | NAME: "D2SwinTransformer" 10 | SWIN: 11 | PRETRAIN_IMG_SIZE: 384 12 | EMBED_DIM: 192 13 | DEPTHS: [2, 2, 18, 2] 14 | NUM_HEADS: [6, 12, 24, 48] 15 | WINDOW_SIZE: 12 16 | APE: False 17 | DROP_PATH_RATE: 0.2 18 | PATCH_NORM: True 19 | WEIGHTS: "weights/vg_swinL.pth" 20 | PIXEL_MEAN: [123.675, 116.280, 103.530] 21 | PIXEL_STD: [58.395, 57.120, 57.375] 22 | MASK_ON: False 23 | OTA: False 24 | STILL_CLS_FOR_ENCODER: True 25 | USE_MULTI_BBOX_EMBED: False 26 | TEXT: 27 | TEXT_DECODER: 'google/flan-t5-base' 28 | FIX_TEXT_DECODER: False 29 | USE_GENERATE_LOSS: True 30 | USE_FOCAL_LOSS: False 31 | GENERATE_LOSS_WEIGHT: 1.0 32 | USE_ALL_NEGATIVE: True 33 | ZERO_SHOT_WEIGHT: 'datasets/lvis/lvis_v1_clip_a+cname_ViT-H.npy' 34 | BEAM_SIZE: 3 35 | DDETRS: 36 | NUM_CLASSES: [1] 37 | NUM_OBJECT_QUERIES: 300 38 | TWO_STAGE_NUM_PROPOSALS: 300 39 | USE_DAB: False 40 | USE_DINO: False 41 | DYNAMIC_LABEL_ENC: True 42 | DROPOUT: 0.0 43 | TWO_STAGE: True 44 | MIXED_SELECTION: False 45 | LOOK_FORWARD_TWICE: False 46 | DATASETS: 47 | TRAIN: ("vg_from_objects", "grit5m_train_pseudo",) 48 | TEST: ("lvis_v1_minival",) 49 | SOLVER: 50 | IMS_PER_BATCH: 128 #unused when multisampler 51 | BASE_LR: 0.0001 52 | VL_LR: 0.0001 53 | STEPS: (520000,) 54 | MAX_ITER: 600000 55 | WARMUP_ITERS: 200 56 | WEIGHT_DECAY: 0.05 57 | OPTIMIZER: "ADAMW" 58 | BACKBONE_MULTIPLIER: 0.1 59 | CLIP_GRADIENTS: 60 | ENABLED: True 61 | CLIP_TYPE: "full_model" 62 | CLIP_VALUE: 0.1 63 | NORM_TYPE: 2.0 64 | CHECKPOINT_PERIOD: 10000 65 | INPUT: 66 | MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) 67 | CROP: 68 | ENABLED: True 69 | TYPE: "absolute_range" 70 | SIZE: (384, 600) 71 | FORMAT: "RGB" 72 | TEST: 73 | EVAL_PERIOD: 30000 74 | EVAL_AFTER_TRAIN: True 75 | NUM_TEST_QUERIES: 300 76 | DATALOADER: 77 | SAMPLER_TRAIN: "MultiDatasetSampler" #"ClassAwareSampler" # 78 | DATASET_RATIO: [1,4] 79 | USE_DIFF_BS_SIZE: True 80 | DATASET_BS: [2,2] 81 | USE_RFS: [False,False] 82 | USE_CAS: [False,False] 83 | 
MULTI_DATASET_GROUPING: True 84 | DATASET_INPUT_SIZE: [1024,1024,] 85 | DATASET_INPUT_SCALE: [[0.1, 2.0],[0.1, 2.0],] 86 | NUM_WORKERS: 8 87 | FILTER_EMPTY_ANNOTATIONS: True 88 | VERSION: 2 89 | OUTPUT_DIR: ./test 90 | -------------------------------------------------------------------------------- /projects/DDETRS/configs/vg_grit5m_swinT.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | USE_IOU_BRANCH: True 3 | LANG_GUIDE_DET: False 4 | LANG_AS_TGT: False 5 | DECOUPLE_TGT: False 6 | STILL_TGT_FOR_BOTH: True 7 | META_ARCHITECTURE: "DDETRSVLUni" 8 | BACKBONE: 9 | NAME: "D2SwinTransformer" 10 | SWIN: 11 | EMBED_DIM: 96 12 | DEPTHS: [2, 2, 6, 2] 13 | NUM_HEADS: [3, 6, 12, 24] 14 | WINDOW_SIZE: 7 15 | APE: False 16 | DROP_PATH_RATE: 0.3 17 | PATCH_NORM: True 18 | WEIGHTS: "weights/vg_swinT.pth" 19 | PIXEL_MEAN: [123.675, 116.280, 103.530] 20 | PIXEL_STD: [58.395, 57.120, 57.375] 21 | MASK_ON: False 22 | OTA: False 23 | STILL_CLS_FOR_ENCODER: True 24 | USE_MULTI_BBOX_EMBED: False 25 | TEXT: 26 | TEXT_DECODER: 'google/flan-t5-base' 27 | FIX_TEXT_DECODER: False 28 | USE_GENERATE_LOSS: True 29 | USE_FOCAL_LOSS: False 30 | GENERATE_LOSS_WEIGHT: 1.0 31 | USE_ALL_NEGATIVE: True 32 | ZERO_SHOT_WEIGHT: 'datasets/lvis/lvis_v1_clip_a+cname_ViT-H.npy' 33 | BEAM_SIZE: 3 34 | DDETRS: 35 | NUM_CLASSES: [1] 36 | NUM_OBJECT_QUERIES: 300 37 | TWO_STAGE_NUM_PROPOSALS: 300 38 | USE_DAB: False 39 | USE_DINO: False 40 | DYNAMIC_LABEL_ENC: True 41 | DROPOUT: 0.0 42 | TWO_STAGE: True 43 | MIXED_SELECTION: False 44 | LOOK_FORWARD_TWICE: False 45 | DATASETS: 46 | TRAIN: ("vg_from_objects", "grit5m_train_pseudo",) 47 | TEST: ("lvis_v1_minival",) 48 | SOLVER: 49 | IMS_PER_BATCH: 128 #unused when multisampler 50 | BASE_LR: 0.0001 51 | VL_LR: 0.0001 52 | STEPS: (260000,) 53 | MAX_ITER: 300000 54 | WARMUP_FACTOR: 1.0 55 | WARMUP_ITERS: 200 56 | WEIGHT_DECAY: 0.05 57 | OPTIMIZER: "ADAMW" 58 | BACKBONE_MULTIPLIER: 0.1 59 | CLIP_GRADIENTS: 60 | ENABLED: True 61 | CLIP_TYPE: "full_model" 62 | CLIP_VALUE: 0.1 63 | NORM_TYPE: 2.0 64 | CHECKPOINT_PERIOD: 10000 65 | INPUT: 66 | MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) 67 | CROP: 68 | ENABLED: True 69 | TYPE: "absolute_range" 70 | SIZE: (384, 600) 71 | FORMAT: "RGB" 72 | TEST: 73 | EVAL_PERIOD: 30000 74 | EVAL_AFTER_TRAIN: True 75 | NUM_TEST_QUERIES: 300 76 | DATALOADER: 77 | SAMPLER_TRAIN: "MultiDatasetSampler" 78 | DATASET_RATIO: [1,4] 79 | USE_DIFF_BS_SIZE: True 80 | DATASET_BS: [4,4] 81 | USE_RFS: [False,False] 82 | USE_CAS: [False,False] 83 | MULTI_DATASET_GROUPING: True 84 | DATASET_INPUT_SIZE: [1024,1024,] 85 | DATASET_INPUT_SCALE: [[0.1, 2.0],[0.1, 2.0],] 86 | NUM_WORKERS: 8 87 | FILTER_EMPTY_ANNOTATIONS: True 88 | VERSION: 2 89 | OUTPUT_DIR: ./obj365v2_ep12_32g_r50_new_dino 90 | -------------------------------------------------------------------------------- /projects/DDETRS/configs/vg_swinL.yaml: -------------------------------------------------------------------------------- 1 | # Align with H-Deformable DETR (8 GPU training) 2 | # Language-guided detection 3 | # Add boxInst, iou branch, still tgt 4 | # main setting of our paper (R50 Obj365 pretrain) 5 | MODEL: 6 | USE_IOU_BRANCH: True 7 | LANG_GUIDE_DET: False #True # 8 | LANG_AS_TGT: False #True #False 9 | DECOUPLE_TGT: False #True 10 | STILL_TGT_FOR_BOTH: True 11 | META_ARCHITECTURE: "DDETRSVLUni" 12 | BACKBONE: 13 | NAME: "D2SwinTransformer" 14 | SWIN: 15 | PRETRAIN_IMG_SIZE: 384 16 | EMBED_DIM: 192 17 | DEPTHS: [2, 2, 18, 2] 18 | 
NUM_HEADS: [6, 12, 24, 48] 19 | WINDOW_SIZE: 12 20 | APE: False 21 | DROP_PATH_RATE: 0.2 22 | PATCH_NORM: True 23 | WEIGHTS: "weights/swin_large_patch4_window12_384_22k.pkl" 24 | PIXEL_MEAN: [123.675, 116.280, 103.530] 25 | PIXEL_STD: [58.395, 57.120, 57.375] 26 | MASK_ON: False 27 | OTA: False 28 | STILL_CLS_FOR_ENCODER: True 29 | USE_MULTI_BBOX_EMBED: False 30 | TEXT: 31 | TEXT_DECODER: 'google/flan-t5-base' 32 | FIX_TEXT_DECODER: False 33 | USE_GENERATE_LOSS: True 34 | USE_FOCAL_LOSS: False 35 | GENERATE_LOSS_WEIGHT: 1.0 36 | USE_ALL_NEGATIVE: True 37 | ZERO_SHOT_WEIGHT: 'datasets/lvis/lvis_v1_clip_a+cname_ViT-H.npy' 38 | BEAM_SIZE: 3 39 | DDETRS: 40 | NUM_CLASSES: [1] 41 | NUM_OBJECT_QUERIES: 300 42 | TWO_STAGE_NUM_PROPOSALS: 300 43 | USE_DAB: False 44 | USE_DINO: False 45 | DYNAMIC_LABEL_ENC: True 46 | DROPOUT: 0.0 47 | TWO_STAGE: True 48 | MIXED_SELECTION: False #True 49 | LOOK_FORWARD_TWICE: False #True 50 | DATASETS: 51 | TRAIN: ("vg_from_objects",) 52 | TEST: ("lvis_v1_minival",) 53 | SOLVER: 54 | IMS_PER_BATCH: 128 55 | BASE_LR: 0.0002 56 | VL_LR: 0.0003 57 | STEPS: (160000,) 58 | MAX_ITER: 180000 59 | WARMUP_FACTOR: 1.0 60 | WARMUP_ITERS: 200 61 | WEIGHT_DECAY: 0.05 62 | OPTIMIZER: "ADAMW" 63 | BACKBONE_MULTIPLIER: 0.1 64 | CLIP_GRADIENTS: 65 | ENABLED: True 66 | CLIP_TYPE: "full_model" 67 | CLIP_VALUE: 0.1 68 | NORM_TYPE: 2.0 69 | CHECKPOINT_PERIOD: 10000 70 | INPUT: 71 | MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) 72 | CROP: 73 | ENABLED: True 74 | TYPE: "absolute_range" 75 | SIZE: (384, 600) 76 | FORMAT: "RGB" 77 | TEST: 78 | EVAL_PERIOD: 10000 79 | EVAL_AFTER_TRAIN: True 80 | NUM_TEST_QUERIES: 300 81 | DATALOADER: 82 | SAMPLER_TRAIN: "MultiDatasetSampler" #"ClassAwareSampler" # 83 | DATASET_RATIO: [1] 84 | USE_DIFF_BS_SIZE: True 85 | DATASET_BS: [2] 86 | USE_RFS: [False] 87 | USE_CAS: [False] 88 | MULTI_DATASET_GROUPING: True 89 | DATASET_INPUT_SIZE: [1024,] 90 | DATASET_INPUT_SCALE: [[0.1, 2.0],] 91 | NUM_WORKERS: 8 92 | FILTER_EMPTY_ANNOTATIONS: True 93 | VERSION: 2 94 | OUTPUT_DIR: ./test 95 | -------------------------------------------------------------------------------- /projects/DDETRS/configs/vg_swinT.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | USE_IOU_BRANCH: True 3 | LANG_GUIDE_DET: False 4 | LANG_AS_TGT: False 5 | DECOUPLE_TGT: False 6 | STILL_TGT_FOR_BOTH: True 7 | META_ARCHITECTURE: "DDETRSVLUni" 8 | BACKBONE: 9 | NAME: "D2SwinTransformer" 10 | SWIN: 11 | EMBED_DIM: 96 12 | DEPTHS: [2, 2, 6, 2] 13 | NUM_HEADS: [3, 6, 12, 24] 14 | WINDOW_SIZE: 7 15 | APE: False 16 | DROP_PATH_RATE: 0.3 17 | PATCH_NORM: True 18 | WEIGHTS: "weights/swin_tiny_patch4_window7_224.pkl" 19 | PIXEL_MEAN: [123.675, 116.280, 103.530] 20 | PIXEL_STD: [58.395, 57.120, 57.375] 21 | OTA: False 22 | STILL_CLS_FOR_ENCODER: True 23 | USE_MULTI_BBOX_EMBED: False 24 | TEXT: 25 | TEXT_DECODER: 'google/flan-t5-base' 26 | FIX_TEXT_DECODER: False 27 | USE_GENERATE_LOSS: True 28 | USE_FOCAL_LOSS: False 29 | GENERATE_LOSS_WEIGHT: 1.0 30 | USE_ALL_NEGATIVE: True 31 | ZERO_SHOT_WEIGHT: 'datasets/lvis/lvis_v1_clip_a+cname_ViT-H.npy' 32 | BEAM_SIZE: 3 33 | DDETRS: 34 | NUM_CLASSES: [1] 35 | NUM_OBJECT_QUERIES: 300 36 | TWO_STAGE_NUM_PROPOSALS: 300 37 | USE_DAB: False 38 | USE_DINO: False 39 | DYNAMIC_LABEL_ENC: True 40 | DROPOUT: 0.0 41 | TWO_STAGE: True 42 | MIXED_SELECTION: False #True 43 | LOOK_FORWARD_TWICE: False #True 44 | DATASETS: 45 | TRAIN: ("vg_from_objects",) 46 | TEST: ("lvis_v1_minival",) 47 | SOLVER: 48 | 
IMS_PER_BATCH: 128 #unused 49 | BASE_LR: 0.0002 50 | VL_LR: 0.0003 51 | STEPS: (160000,) 52 | MAX_ITER: 180000 53 | WARMUP_FACTOR: 1.0 54 | WARMUP_ITERS: 200 55 | WEIGHT_DECAY: 0.05 56 | OPTIMIZER: "ADAMW" 57 | BACKBONE_MULTIPLIER: 0.1 58 | CLIP_GRADIENTS: 59 | ENABLED: True 60 | CLIP_TYPE: "full_model" 61 | CLIP_VALUE: 0.1 62 | NORM_TYPE: 2.0 63 | CHECKPOINT_PERIOD: 10000 64 | INPUT: 65 | MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) 66 | CROP: 67 | ENABLED: True 68 | TYPE: "absolute_range" 69 | SIZE: (384, 600) 70 | FORMAT: "RGB" 71 | TEST: 72 | EVAL_PERIOD: 30000 73 | EVAL_AFTER_TRAIN: True 74 | NUM_TEST_QUERIES: 300 75 | TOPK_FOR_MAPPING: 15 76 | DATALOADER: 77 | SAMPLER_TRAIN: "MultiDatasetSampler" 78 | DATASET_RATIO: [1] 79 | USE_DIFF_BS_SIZE: True 80 | DATASET_BS: [4] 81 | USE_RFS: [False] 82 | USE_CAS: [False] 83 | MULTI_DATASET_GROUPING: True 84 | DATASET_INPUT_SIZE: [1024,] 85 | DATASET_INPUT_SCALE: [[0.1, 2.0],] 86 | NUM_WORKERS: 8 87 | FILTER_EMPTY_ANNOTATIONS: True 88 | VERSION: 2 89 | OUTPUT_DIR: ./test 90 | -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/__init__.py: -------------------------------------------------------------------------------- 1 | from .config_uni import add_ddetrsvluni_config 2 | from .ddetrs_vl_uni import DDETRSVLUni 3 | from .data.datasets import vg 4 | from .data.datasets import lvis_minival 5 | from .data.datasets import grit20m 6 | from .data.datasets import grit_pseudo 7 | from .backbone.swin import D2SwinTransformer 8 | from .data.mapper import build_detection_test_loader -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/backbone/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FoundationVision/GenerateU/904e5337758d554e44ad50c2afa567c171166cd9/projects/DDETRS/ddetrs/backbone/__init__.py -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/backbone/masked_backbone.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | from typing import Dict 6 | from detectron2.modeling import build_backbone 7 | from ..util.misc import NestedTensor 8 | 9 | 10 | class MaskedBackbone(nn.Module): 11 | """ This is a thin wrapper around D2's backbone to provide padding masking""" 12 | 13 | def __init__(self, cfg, input_shape=None): 14 | super().__init__() 15 | self.backbone = build_backbone(cfg, input_shape=input_shape) 16 | backbone_shape = self.backbone.output_shape() 17 | self.feature_strides = [backbone_shape[f].stride for f in backbone_shape.keys()] 18 | 19 | self.num_channels = [backbone_shape[f].channels for f in backbone_shape.keys()] 20 | 21 | def forward(self, tensor_list): 22 | xs = self.backbone(tensor_list.tensors) 23 | out: Dict[str, NestedTensor] = {} 24 | for name, x in xs.items(): 25 | m = tensor_list.mask 26 | assert m is not None 27 | mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] 28 | out[name] = NestedTensor(x, mask) 29 | return out 30 | 31 | def mask_out_padding(self, feature_shapes, image_sizes, device): 32 | masks = [] 33 | assert len(feature_shapes) == len(self.feature_strides) 34 | for idx, shape in enumerate(feature_shapes): 35 | N, _, H, W = shape 36 | masks_per_feature_level = torch.ones((N, H, W), dtype=torch.bool, 
device=device) 37 | for img_idx, (h, w) in enumerate(image_sizes): 38 | masks_per_feature_level[ 39 | img_idx, 40 | : int(np.ceil(float(h) / self.feature_strides[idx])), 41 | : int(np.ceil(float(w) / self.feature_strides[idx])), 42 | ] = 0 43 | masks.append(masks_per_feature_level) 44 | return masks -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/backbone/pos_embed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # -------------------------------------------------------- 7 | # Position embedding utils 8 | # -------------------------------------------------------- 9 | 10 | import numpy as np 11 | 12 | import torch 13 | 14 | # -------------------------------------------------------- 15 | # 2D sine-cosine position embedding 16 | # References: 17 | # Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py 18 | # MoCo v3: https://github.com/facebookresearch/moco-v3 19 | # -------------------------------------------------------- 20 | def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): 21 | """ 22 | grid_size: int of the grid height and width 23 | return: 24 | pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) 25 | """ 26 | grid_h = np.arange(grid_size, dtype=np.float32) 27 | grid_w = np.arange(grid_size, dtype=np.float32) 28 | grid = np.meshgrid(grid_w, grid_h) # here w goes first 29 | grid = np.stack(grid, axis=0) 30 | 31 | grid = grid.reshape([2, 1, grid_size, grid_size]) 32 | pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) 33 | if cls_token: 34 | pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) 35 | return pos_embed 36 | 37 | 38 | def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): 39 | assert embed_dim % 2 == 0 40 | 41 | # use half of dimensions to encode grid_h 42 | emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) 43 | emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) 44 | 45 | emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) 46 | return emb 47 | 48 | 49 | def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): 50 | """ 51 | embed_dim: output dimension for each position 52 | pos: a list of positions to be encoded: size (M,) 53 | out: (M, D) 54 | """ 55 | assert embed_dim % 2 == 0 56 | omega = np.arange(embed_dim // 2, dtype=np.float) 57 | omega /= embed_dim / 2. 58 | omega = 1. 
/ 10000**omega # (D/2,) 59 | 60 | pos = pos.reshape(-1) # (M,) 61 | out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product 62 | 63 | emb_sin = np.sin(out) # (M, D/2) 64 | emb_cos = np.cos(out) # (M, D/2) 65 | 66 | emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) 67 | return emb 68 | 69 | 70 | # -------------------------------------------------------- 71 | # Interpolate position embeddings for high-resolution 72 | # References: 73 | # DeiT: https://github.com/facebookresearch/deit 74 | # -------------------------------------------------------- 75 | def interpolate_pos_embed(model, checkpoint_model): 76 | if 'pos_embed' in checkpoint_model: 77 | pos_embed_checkpoint = checkpoint_model['pos_embed'] 78 | embedding_size = pos_embed_checkpoint.shape[-1] 79 | num_patches = model.patch_embed.num_patches 80 | num_extra_tokens = model.pos_embed.shape[-2] - num_patches 81 | # height (== width) for the checkpoint position embedding 82 | orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) 83 | # height (== width) for the new position embedding 84 | new_size = int(num_patches ** 0.5) 85 | # class_token and dist_token are kept unchanged 86 | if orig_size != new_size: 87 | print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size)) 88 | extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] 89 | # only the position tokens are interpolated 90 | pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] 91 | pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2) 92 | pos_tokens = torch.nn.functional.interpolate( 93 | pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False) 94 | pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) 95 | new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) 96 | checkpoint_model['pos_embed'] = new_pos_embed 97 | -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/data/custom_build_augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from detectron2.data import transforms as T 3 | from .transforms.custom_augmentation_impl import EfficientDetResizeCrop 4 | 5 | 6 | def build_custom_augmentation(cfg, is_train, scale=None, size=None, \ 7 | min_size=None, max_size=None): 8 | """ 9 | Create a list of default :class:`Augmentation` from config. 10 | Now it includes resizing and flipping. 
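Illustrative usage (a sketch added for clarity; `cfg` is a standard D2 config node):

    # With cfg.INPUT.CUSTOM_AUG == 'EfficientDetResizeCrop', training-time augmentation
    # resizes/crops to cfg.INPUT.TRAIN_SIZE within cfg.INPUT.SCALE_RANGE and adds a RandomFlip.
    train_augs = build_custom_augmentation(cfg, is_train=True)
    test_augs = build_custom_augmentation(cfg, is_train=False)  # fixed scale (1, 1) at TEST_SIZE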
11 | 12 | Returns: 13 | list[Augmentation] 14 | """ 15 | if cfg.INPUT.CUSTOM_AUG == 'ResizeShortestEdge': 16 | if is_train: 17 | min_size = cfg.INPUT.MIN_SIZE_TRAIN if min_size is None else min_size 18 | max_size = cfg.INPUT.MAX_SIZE_TRAIN if max_size is None else max_size 19 | sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING 20 | else: 21 | min_size = cfg.INPUT.MIN_SIZE_TEST 22 | max_size = cfg.INPUT.MAX_SIZE_TEST 23 | sample_style = "choice" 24 | augmentation = [T.ResizeShortestEdge(min_size, max_size, sample_style)] 25 | elif cfg.INPUT.CUSTOM_AUG == 'EfficientDetResizeCrop': 26 | if is_train: 27 | scale = cfg.INPUT.SCALE_RANGE if scale is None else scale 28 | size = cfg.INPUT.TRAIN_SIZE if size is None else size 29 | else: 30 | scale = (1, 1) 31 | size = cfg.INPUT.TEST_SIZE 32 | augmentation = [EfficientDetResizeCrop(size, scale)] 33 | else: 34 | assert 0, cfg.INPUT.CUSTOM_AUG 35 | 36 | if is_train: 37 | augmentation.append(T.RandomFlip()) 38 | return augmentation 39 | 40 | 41 | build_custom_transform_gen = build_custom_augmentation 42 | """ 43 | Alias for backward-compatibility. 44 | """ 45 | -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/data/datasets/grit20m.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from fvcore.common.timer import Timer 4 | from detectron2.structures import BoxMode 5 | from fvcore.common.file_io import PathManager 6 | from detectron2.data import DatasetCatalog, MetadataCatalog 7 | from lvis import LVIS 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | __all__ = ["load_grit_json", "register_grit_instances"] 12 | 13 | 14 | def register_grit_instances(name, metadata, json_file, image_root): 15 | """ 16 | """ 17 | DatasetCatalog.register(name, lambda: load_grit_json( 18 | json_file, image_root, name)) 19 | MetadataCatalog.get(name).set( 20 | json_file=json_file, image_root=image_root, 21 | evaluator_type="vg", **metadata 22 | ) 23 | 24 | 25 | def get_grit_meta(): 26 | categories = [{'supercategory': 'object', 'id': 1, 'name': 'object'}] 27 | grit_categories = sorted(categories, key=lambda x: x["id"]) 28 | thing_classes = [k["name"] for k in grit_categories] 29 | meta = {"thing_classes": thing_classes} 30 | return meta 31 | 32 | 33 | def load_grit_json(json_file, image_root, dataset_name=None, prompt=None): 34 | 35 | json_file = PathManager.get_local_path(json_file) 36 | 37 | timer = Timer() 38 | lvis_api = LVIS(json_file) 39 | if timer.seconds() > 1: 40 | logger.info("Loading {} takes {:.2f} seconds.".format( 41 | json_file, timer.seconds())) 42 | 43 | img_ids = sorted(lvis_api.imgs.keys()) 44 | imgs = lvis_api.load_imgs(img_ids) 45 | anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids] 46 | 47 | ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image] 48 | assert len(set(ann_ids)) == len(ann_ids), \ 49 | "Annotation ids in '{}' are not unique".format(json_file) 50 | 51 | imgs_anns = list(zip(imgs, anns)) 52 | logger.info("Loaded {} images in the LVIS v1 format from {}".format( 53 | len(imgs_anns), json_file)) 54 | 55 | dataset_dicts = [] 56 | 57 | for (img_dict, anno_dict_list) in imgs_anns: 58 | record = {} 59 | if "file_name" in img_dict: 60 | file_name = img_dict["file_name"] 61 | record["file_name"] = os.path.join(image_root, file_name) 62 | 63 | record["height"] = int(img_dict["height"]) 64 | record["width"] = int(img_dict["width"]) 65 | image_id = record["image_id"] = img_dict["id"] 66 | 67 | objs = 
[] 68 | for anno in anno_dict_list: 69 | assert anno["image_id"] == image_id 70 | if anno.get('iscrowd', 0) > 0: 71 | continue 72 | obj = {"bbox": anno["bbox"], "bbox_mode": BoxMode.XYWH_ABS} 73 | obj["category_id"] = 0 74 | 75 | if "object_name" in anno.keys(): 76 | obj["object_description"] = anno["object_name"] 77 | else: 78 | obj["object_description"] = anno["object_descriptions"] 79 | 80 | objs.append(obj) 81 | record["annotations"] = objs 82 | if len(record["annotations"]) == 0: 83 | continue 84 | record["anno_type"] = 'pseudo_box' 85 | record["task"] = 'detect' 86 | dataset_dicts.append(record) 87 | 88 | return dataset_dicts 89 | 90 | _CUSTOM_SPLITS_LVIS = { 91 | "grit_5m": ("grit_20m/images", "grit_20m/grit_object5m.json"), 92 | "grit_20m": ("grit_20m/images", "grit_20m/grit_object20m.json"), 93 | } 94 | 95 | for key, (image_root, json_file) in _CUSTOM_SPLITS_LVIS.items(): 96 | register_grit_instances( 97 | key, 98 | get_grit_meta(), 99 | os.path.join("datasets", json_file) if "://" not in json_file else json_file, 100 | os.path.join("datasets", image_root), 101 | ) 102 | -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/data/datasets/grit_pseudo.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from fvcore.common.timer import Timer 4 | from detectron2.structures import BoxMode 5 | from fvcore.common.file_io import PathManager 6 | from detectron2.data import DatasetCatalog, MetadataCatalog 7 | from lvis import LVIS 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | __all__ = ["load_grit_json", "register_grit_instances"] 12 | 13 | 14 | def register_grit_instances(name, metadata, json_file, image_root): 15 | """ 16 | """ 17 | DatasetCatalog.register(name, lambda: load_grit_json( 18 | json_file, image_root, name)) 19 | MetadataCatalog.get(name).set( 20 | json_file=json_file, image_root=image_root, 21 | evaluator_type="vg", **metadata 22 | ) 23 | 24 | 25 | def get_grit_meta(): 26 | categories = [{'supercategory': 'object', 'id': 1, 'name': 'object'}] 27 | grit_categories = sorted(categories, key=lambda x: x["id"]) 28 | thing_classes = [k["name"] for k in grit_categories] 29 | meta = {"thing_classes": thing_classes} 30 | return meta 31 | 32 | 33 | def load_grit_json(json_file, image_root, dataset_name=None, prompt=None): 34 | 35 | json_file = PathManager.get_local_path(json_file) 36 | 37 | timer = Timer() 38 | lvis_api = LVIS(json_file) 39 | if timer.seconds() > 1: 40 | logger.info("Loading {} takes {:.2f} seconds.".format( 41 | json_file, timer.seconds())) 42 | 43 | img_ids = sorted(lvis_api.imgs.keys()) 44 | imgs = lvis_api.load_imgs(img_ids) 45 | anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids] 46 | 47 | ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image] 48 | assert len(set(ann_ids)) == len(ann_ids), \ 49 | "Annotation ids in '{}' are not unique".format(json_file) 50 | 51 | imgs_anns = list(zip(imgs, anns)) 52 | logger.info("Loaded {} images in the LVIS v1 format from {}".format( 53 | len(imgs_anns), json_file)) 54 | 55 | dataset_dicts = [] 56 | 57 | for (img_dict, anno_dict_list) in imgs_anns: 58 | record = {} 59 | if "file_name" in img_dict: 60 | file_name = img_dict["file_name"] 61 | record["file_name"] = os.path.join(image_root, file_name) 62 | 63 | record["height"] = int(img_dict["height"]) 64 | record["width"] = int(img_dict["width"]) 65 | image_id = record["image_id"] = img_dict["id"] 66 | 67 | objs = [] 68 | for 
anno in anno_dict_list: 69 | assert anno["image_id"] == image_id 70 | if anno.get('iscrowd', 0) > 0: 71 | continue 72 | obj = {"bbox": anno["bbox"], "bbox_mode": BoxMode.XYWH_ABS} 73 | obj["category_id"] = 0 74 | 75 | if "object_name" in anno.keys(): 76 | obj["object_description"] = anno["object_name"] 77 | else: 78 | obj["object_description"] = anno["object_descriptions"] 79 | 80 | objs.append(obj) 81 | record["annotations"] = objs 82 | if len(record["annotations"]) == 0: 83 | continue 84 | record["anno_type"] = 'box' 85 | record["task"] = 'detect' 86 | dataset_dicts.append(record) 87 | 88 | return dataset_dicts 89 | 90 | _CUSTOM_SPLITS_LVIS = { 91 | "grit2_5m_train_pseudo": ("grit_20m/images", "grit_20m/annotations/grit2_5m_train_pseudo.json"), 92 | "grit5m_train_pseudo": ("grit_20m/images", "grit_20m/annotations/grit5m_train_pseudo.json"), 93 | } 94 | 95 | for key, (image_root, json_file) in _CUSTOM_SPLITS_LVIS.items(): 96 | register_grit_instances( 97 | key, 98 | get_grit_meta(), 99 | os.path.join("datasets", json_file) if "://" not in json_file else json_file, 100 | os.path.join("datasets", image_root), 101 | ) 102 | -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/data/datasets/parsers/__init__.py: -------------------------------------------------------------------------------- 1 | from .coco_api import COCO, COCOeval 2 | from .coco_video_parser import CocoVID 3 | 4 | __all__ = ['COCO', 'COCOeval', 'CocoVID'] 5 | -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/data/datasets/parsers/coco_api.py: -------------------------------------------------------------------------------- 1 | # This file add snake case alias for coco api 2 | 3 | import warnings 4 | 5 | import pycocotools 6 | from pycocotools.coco import COCO as _COCO 7 | from pycocotools.cocoeval import COCOeval as _COCOeval 8 | 9 | 10 | class COCO(_COCO): 11 | """This class is almost the same as official pycocotools package. 12 | 13 | It implements some snake case function aliases. So that the COCO class has 14 | the same interface as LVIS class. 15 | """ 16 | 17 | def __init__(self, annotation_file=None): 18 | if getattr(pycocotools, '__version__', '0') >= '12.0.2': 19 | warnings.warn( 20 | 'mmpycocotools is deprecated. 
Please install official pycocotools by "pip install pycocotools"', # noqa: E501 21 | UserWarning) 22 | super().__init__(annotation_file=annotation_file) 23 | self.img_ann_map = self.imgToAnns 24 | self.cat_img_map = self.catToImgs 25 | 26 | def get_ann_ids(self, img_ids=[], cat_ids=[], area_rng=[], iscrowd=None): 27 | return self.getAnnIds(img_ids, cat_ids, area_rng, iscrowd) 28 | 29 | def get_cat_ids(self, cat_names=[], sup_names=[], cat_ids=[]): 30 | return self.getCatIds(cat_names, sup_names, cat_ids) 31 | 32 | def get_img_ids(self, img_ids=[], cat_ids=[]): 33 | return self.getImgIds(img_ids, cat_ids) 34 | 35 | def load_anns(self, ids): 36 | return self.loadAnns(ids) 37 | 38 | def load_cats(self, ids): 39 | return self.loadCats(ids) 40 | 41 | def load_imgs(self, ids): 42 | return self.loadImgs(ids) 43 | 44 | 45 | # just for the ease of import 46 | COCOeval = _COCOeval 47 | -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/data/datasets/parsers/coco_video_parser.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import defaultdict 3 | from .coco_api import COCO 4 | from pycocotools.coco import _isArrayLike 5 | 6 | 7 | class CocoVID(COCO): 8 | 9 | def __init__(self, annotation_file=None): 10 | assert annotation_file, 'Annotation file must be provided.' 11 | super(CocoVID, self).__init__(annotation_file=annotation_file) 12 | 13 | def createIndex(self): 14 | print('creating index...') 15 | anns, cats, imgs, vids = {}, {}, {}, {} 16 | imgToAnns, catToImgs, vidToImgs = defaultdict(list), defaultdict( 17 | list), defaultdict(list) 18 | 19 | if 'videos' in self.dataset: 20 | for video in self.dataset['videos']: 21 | vids[video['id']] = video 22 | 23 | if 'annotations' in self.dataset: 24 | for ann in self.dataset['annotations']: 25 | imgToAnns[ann['image_id']].append(ann) 26 | anns[ann['id']] = ann 27 | 28 | if 'images' in self.dataset: 29 | for img in self.dataset['images']: 30 | vidToImgs[img['video_id']].append(img) 31 | imgs[img['id']] = img 32 | 33 | if 'categories' in self.dataset: 34 | for cat in self.dataset['categories']: 35 | cats[cat['id']] = cat 36 | 37 | if 'annotations' in self.dataset and 'categories' in self.dataset: 38 | for ann in self.dataset['annotations']: 39 | catToImgs[ann['category_id']].append(ann['image_id']) 40 | 41 | print('index created!') 42 | 43 | self.anns = anns 44 | self.imgToAnns = imgToAnns 45 | self.catToImgs = catToImgs 46 | self.imgs = imgs 47 | self.cats = cats 48 | self.videos = vids 49 | self.vidToImgs = vidToImgs 50 | 51 | def get_vid_ids(self, vidIds=[]): 52 | vidIds = vidIds if _isArrayLike(vidIds) else [vidIds] 53 | 54 | if len(vidIds) == 0: 55 | ids = self.videos.keys() 56 | else: 57 | ids = set(vidIds) 58 | 59 | return list(ids) 60 | 61 | def get_img_ids_from_vid(self, vidId): 62 | img_infos = self.vidToImgs[vidId] 63 | ids = list(np.zeros([len(img_infos)], dtype=np.int)) 64 | for img_info in img_infos: 65 | ids[img_info['frame_id']] = img_info['id'] 66 | return ids 67 | 68 | def load_vids(self, ids=[]): 69 | if _isArrayLike(ids): 70 | return [self.videos[id] for id in ids] 71 | elif type(ids) == int: 72 | return [self.videos[ids]] 73 | -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/data/datasets/vg.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from fvcore.common.timer import Timer 4 | 
from detectron2.structures import BoxMode 5 | from fvcore.common.file_io import PathManager 6 | from detectron2.data import DatasetCatalog, MetadataCatalog 7 | from lvis import LVIS 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | __all__ = ["load_vg_json", "register_vg_instances"] 12 | 13 | 14 | def register_vg_instances(name, metadata, prompt, json_file, image_root): 15 | """ 16 | """ 17 | DatasetCatalog.register(name, lambda: load_vg_json( 18 | json_file, image_root, name, prompt)) 19 | MetadataCatalog.get(name).set( 20 | json_file=json_file, image_root=image_root, 21 | evaluator_type="vg", **metadata 22 | ) 23 | 24 | 25 | def get_vg_meta(): 26 | categories = [{'supercategory': 'object', 'id': 1, 'name': 'object'}] 27 | vg_categories = sorted(categories, key=lambda x: x["id"]) 28 | thing_classes = [k["name"] for k in vg_categories] 29 | meta = {"thing_classes": thing_classes} 30 | return meta 31 | 32 | 33 | def load_vg_json(json_file, image_root, dataset_name=None, prompt=None): 34 | 35 | json_file = PathManager.get_local_path(json_file) 36 | 37 | timer = Timer() 38 | lvis_api = LVIS(json_file) 39 | if timer.seconds() > 1: 40 | logger.info("Loading {} takes {:.2f} seconds.".format( 41 | json_file, timer.seconds())) 42 | 43 | img_ids = sorted(lvis_api.imgs.keys()) 44 | imgs = lvis_api.load_imgs(img_ids) 45 | anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids] 46 | 47 | ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image] 48 | assert len(set(ann_ids)) == len(ann_ids), \ 49 | "Annotation ids in '{}' are not unique".format(json_file) 50 | 51 | imgs_anns = list(zip(imgs, anns)) 52 | logger.info("Loaded {} images in the LVIS v1 format from {}".format( 53 | len(imgs_anns), json_file)) 54 | 55 | dataset_dicts = [] 56 | 57 | for (img_dict, anno_dict_list) in imgs_anns: 58 | record = {} 59 | if "file_name" in img_dict: 60 | file_name = img_dict["file_name"] 61 | record["file_name"] = os.path.join(image_root, file_name) 62 | 63 | record["height"] = int(img_dict["height"]) 64 | record["width"] = int(img_dict["width"]) 65 | image_id = record["image_id"] = img_dict["id"] 66 | 67 | objs = [] 68 | for anno in anno_dict_list: 69 | assert anno["image_id"] == image_id 70 | if anno.get('iscrowd', 0) > 0: 71 | continue 72 | obj = {"bbox": anno["bbox"], "bbox_mode": BoxMode.XYWH_ABS} 73 | obj["category_id"] = 0 74 | 75 | if "caption_with_token" in anno.keys(): 76 | obj["object_description"] = anno["caption_with_token"] 77 | elif "object_name" in anno.keys(): 78 | obj["object_description"] = anno["object_name"] 79 | else: 80 | obj["object_description"] = anno["caption"] 81 | objs.append(obj) 82 | record["annotations"] = objs 83 | if len(record["annotations"]) == 0: 84 | continue 85 | record["task"] = prompt 86 | record["anno_type"] = 'box' 87 | dataset_dicts.append(record) 88 | 89 | return dataset_dicts 90 | 91 | _CUSTOM_SPLITS_LVIS = { 92 | "vg_from_objects": ("vg/images", "vg/train_from_objects.json", 'detect'), 93 | } 94 | 95 | 96 | for key, (image_root, json_file, prompt) in _CUSTOM_SPLITS_LVIS.items(): 97 | register_vg_instances( 98 | key, 99 | get_vg_meta(), 100 | prompt, 101 | os.path.join("datasets", json_file) if "://" not in json_file else json_file, 102 | os.path.join("datasets", image_root), 103 | ) 104 | -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/data/mapper/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import * 2 | from ..datasets 
import * 3 | from .custom_dataset_dataloader import * -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/data/transforms/custom_augmentation_impl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | # Part of the code is from https://github.com/rwightman/efficientdet-pytorch/blob/master/effdet/data/transforms.py 4 | # Modified by Xingyi Zhou 5 | # The original code is under Apache-2.0 License 6 | import numpy as np 7 | from PIL import Image 8 | 9 | from detectron2.data.transforms.augmentation import Augmentation 10 | from .custom_transform import EfficientDetResizeCropTransform 11 | 12 | __all__ = [ 13 | "EfficientDetResizeCrop", 14 | ] 15 | 16 | 17 | class EfficientDetResizeCrop(Augmentation): 18 | """ 19 | Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge. 20 | If `max_size` is reached, then downscale so that the longer edge does not exceed max_size. 21 | """ 22 | 23 | def __init__( 24 | self, size, scale, interp=Image.BILINEAR 25 | ): 26 | """ 27 | """ 28 | super().__init__() 29 | self.target_size = (size, size) 30 | self.scale = scale 31 | self.interp = interp 32 | 33 | def get_transform(self, img): 34 | # Select a random scale factor. 35 | scale_factor = np.random.uniform(*self.scale) 36 | scaled_target_height = scale_factor * self.target_size[0] 37 | scaled_target_width = scale_factor * self.target_size[1] 38 | # Recompute the accurate scale_factor using rounded scaled image size. 39 | width, height = img.shape[1], img.shape[0] 40 | img_scale_y = scaled_target_height / height 41 | img_scale_x = scaled_target_width / width 42 | img_scale = min(img_scale_y, img_scale_x) 43 | 44 | # Select non-zero random offset (x, y) if scaled image is larger than target size 45 | scaled_h = int(height * img_scale) 46 | scaled_w = int(width * img_scale) 47 | offset_y = scaled_h - self.target_size[0] 48 | offset_x = scaled_w - self.target_size[1] 49 | offset_y = int(max(0.0, float(offset_y)) * np.random.uniform(0, 1)) 50 | offset_x = int(max(0.0, float(offset_x)) * np.random.uniform(0, 1)) 51 | return EfficientDetResizeCropTransform( 52 | scaled_h, scaled_w, offset_y, offset_x, img_scale, self.target_size, self.interp) -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/data/transforms/custom_transform.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 3 | # Part of the code is from https://github.com/rwightman/efficientdet-pytorch/blob/master/effdet/data/transforms.py 4 | # Modified by Xingyi Zhou 5 | # The original code is under Apache-2.0 License 6 | import numpy as np 7 | import torch 8 | import torch.nn.functional as F 9 | from fvcore.transforms.transform import ( 10 | CropTransform, 11 | HFlipTransform, 12 | NoOpTransform, 13 | Transform, 14 | TransformList, 15 | ) 16 | from PIL import Image 17 | 18 | try: 19 | import cv2 # noqa 20 | except ImportError: 21 | # OpenCV is an optional dependency at the moment 22 | pass 23 | 24 | __all__ = [ 25 | "EfficientDetResizeCropTransform", 26 | ] 27 | 28 | 29 | class EfficientDetResizeCropTransform(Transform): 30 | """ 31 | """ 32 | 33 | def __init__(self, scaled_h, scaled_w, offset_y, offset_x, img_scale, \ 34 | target_size, interp=None): 35 | """ 36 | Args: 37 | h, w (int): original image size 38 | new_h, new_w (int): new image size 39 | interp: PIL interpolation methods, defaults to bilinear. 40 | """ 41 | # TODO decide on PIL vs opencv 42 | super().__init__() 43 | if interp is None: 44 | interp = Image.BILINEAR 45 | self._set_attributes(locals()) 46 | 47 | def apply_image(self, img, interp=None): 48 | assert len(img.shape) <= 4 49 | 50 | if img.dtype == np.uint8: 51 | pil_image = Image.fromarray(img) 52 | interp_method = interp if interp is not None else self.interp 53 | pil_image = pil_image.resize((self.scaled_w, self.scaled_h), interp_method) 54 | ret = np.asarray(pil_image) 55 | right = min(self.scaled_w, self.offset_x + self.target_size[1]) 56 | lower = min(self.scaled_h, self.offset_y + self.target_size[0]) 57 | if len(ret.shape) <= 3: 58 | ret = ret[self.offset_y: lower, self.offset_x: right] 59 | else: 60 | ret = ret[..., self.offset_y: lower, self.offset_x: right, :] 61 | else: 62 | # PIL only supports uint8 63 | img = torch.from_numpy(img) 64 | shape = list(img.shape) 65 | shape_4d = shape[:2] + [1] * (4 - len(shape)) + shape[2:] 66 | img = img.view(shape_4d).permute(2, 3, 0, 1) # hw(c) -> nchw 67 | _PIL_RESIZE_TO_INTERPOLATE_MODE = {Image.BILINEAR: "bilinear", Image.BICUBIC: "bicubic"} 68 | mode = _PIL_RESIZE_TO_INTERPOLATE_MODE[self.interp] 69 | img = F.interpolate(img, (self.scaled_h, self.scaled_w), mode=mode, align_corners=False) 70 | shape[:2] = (self.scaled_h, self.scaled_w) 71 | ret = img.permute(2, 3, 0, 1).view(shape).numpy() # nchw -> hw(c) 72 | right = min(self.scaled_w, self.offset_x + self.target_size[1]) 73 | lower = min(self.scaled_h, self.offset_y + self.target_size[0]) 74 | if len(ret.shape) <= 3: 75 | ret = ret[self.offset_y: lower, self.offset_x: right] 76 | else: 77 | ret = ret[..., self.offset_y: lower, self.offset_x: right, :] 78 | return ret 79 | 80 | 81 | def apply_coords(self, coords): 82 | coords[:, 0] = coords[:, 0] * self.img_scale 83 | coords[:, 1] = coords[:, 1] * self.img_scale 84 | coords[:, 0] -= self.offset_x 85 | coords[:, 1] -= self.offset_y 86 | return coords 87 | 88 | 89 | def apply_segmentation(self, segmentation): 90 | segmentation = self.apply_image(segmentation, interp=Image.NEAREST) 91 | return segmentation 92 | 93 | 94 | def inverse(self): 95 | raise NotImplementedError 96 | 97 | 98 | def inverse_apply_coords(self, coords): 99 | coords[:, 0] += self.offset_x 100 | coords[:, 1] += self.offset_y 101 | coords[:, 0] = coords[:, 0] / self.img_scale 102 | coords[:, 1] = coords[:, 1] / self.img_scale 103 | return coords 104 | 105 | 106 | def inverse_apply_box(self, box: np.ndarray) -> np.ndarray: 107 | """ 108 | """ 109 | 
idxs = np.array([(0, 1), (2, 1), (0, 3), (2, 3)]).flatten() 110 | coords = np.asarray(box).reshape(-1, 4)[:, idxs].reshape(-1, 2) 111 | coords = self.inverse_apply_coords(coords).reshape((-1, 4, 2)) 112 | minxy = coords.min(axis=1) 113 | maxxy = coords.max(axis=1) 114 | trans_boxes = np.concatenate((minxy, maxxy), axis=1) 115 | return trans_boxes -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/models/conv_with_kaiming_uniform.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from detectron2.layers import Conv2d 4 | # from .deform_conv import DFConv2d 5 | from detectron2.layers.batch_norm import get_norm 6 | 7 | 8 | def conv_with_kaiming_uniform( 9 | norm=None, activation=None, 10 | use_deformable=False, use_sep=False): 11 | def make_conv( 12 | in_channels, out_channels, kernel_size, stride=1, dilation=1 13 | ): 14 | if use_deformable: 15 | # conv_func = DFConv2d 16 | assert("deformable is not supported for now") 17 | else: 18 | conv_func = Conv2d 19 | if use_sep: 20 | assert in_channels == out_channels 21 | groups = in_channels 22 | else: 23 | groups = 1 24 | conv = conv_func( 25 | in_channels, 26 | out_channels, 27 | kernel_size=kernel_size, 28 | stride=stride, 29 | padding=dilation * (kernel_size - 1) // 2, 30 | dilation=dilation, 31 | groups=groups, 32 | bias=(norm is None) 33 | ) 34 | if not use_deformable: 35 | # Caffe2 implementation uses XavierFill, which in fact 36 | # corresponds to kaiming_uniform_ in PyTorch 37 | nn.init.kaiming_uniform_(conv.weight, a=1) 38 | if norm is None: 39 | nn.init.constant_(conv.bias, 0) 40 | module = [conv,] 41 | if norm is not None and len(norm) > 0: 42 | if norm == "GN": 43 | norm_module = nn.GroupNorm(32, out_channels) 44 | else: 45 | norm_module = get_norm(norm, out_channels) 46 | module.append(norm_module) 47 | if activation is not None: 48 | module.append(nn.ReLU(inplace=True)) 49 | if len(module) > 1: 50 | return nn.Sequential(*module) 51 | return conv 52 | 53 | return make_conv 54 | -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/models/deformable_detr/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | 11 | -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/models/deformable_detr/bert_model.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | 6 | # from pytorch_pretrained_bert.modeling import BertModel 7 | from transformers import BertConfig, RobertaConfig, RobertaModel, BertModel 8 | 9 | 10 | class BertEncoder(nn.Module): 11 | def __init__(self, cfg): 12 | super(BertEncoder, self).__init__() 13 | self.cfg = cfg 14 | self.bert_name = cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE 15 | print("LANGUAGE BACKBONE USE GRADIENT CHECKPOINTING: ", self.cfg.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT) 16 | 17 | if self.bert_name == "bert-base-uncased": 18 | config = BertConfig.from_pretrained("projects/DDETRS/%s"%self.bert_name) 19 | config.gradient_checkpointing = self.cfg.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT 20 | self.model = BertModel.from_pretrained("projects/DDETRS/%s" % self.bert_name, add_pooling_layer=False, config=config) 21 | self.language_dim = 768 22 | elif self.bert_name == "roberta-base": 23 | config = RobertaConfig.from_pretrained(self.bert_name) 24 | config.gradient_checkpointing = self.cfg.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT 25 | self.model = RobertaModel.from_pretrained(self.bert_name, add_pooling_layer=False, config=config) 26 | self.language_dim = 768 27 | else: 28 | raise NotImplementedError 29 | 30 | self.num_layers = cfg.MODEL.LANGUAGE_BACKBONE.N_LAYERS # 1 31 | self.parallel_det = cfg.MODEL.PARALLEL_DET 32 | 33 | def forward(self, x, task=None): 34 | input = x["input_ids"] # (bs, seq_len) 35 | mask = x["attention_mask"] # (bs, seq_len) 36 | 37 | if self.parallel_det and task == "detection": 38 | # disable interaction among tokens 39 | bs, seq_len = mask.shape 40 | mask_new = torch.zeros((bs, seq_len, seq_len), device=mask.device) 41 | for _ in range(bs): 42 | mask_new[_, :, :] = mask[_] 43 | num_valid = torch.sum(mask[_]) 44 | mask_new[_, :num_valid, :num_valid] = torch.eye(num_valid) 45 | # with padding, always 256 46 | outputs = self.model( 47 | input_ids=input, 48 | attention_mask=mask_new, 49 | output_hidden_states=True, 50 | ) 51 | else: 52 | # with padding, always 256 53 | outputs = self.model( 54 | input_ids=input, 55 | attention_mask=mask, 56 | output_hidden_states=True, 57 | ) 58 | # outputs has 13 layers, 1 input layer and 12 hidden layers 59 | encoded_layers = outputs.hidden_states[1:] 60 | # features = None 61 | # features = torch.stack(encoded_layers[-self.num_layers:], 1).mean(1) # (bs, seq_len, language_dim) 62 | 63 | # # language embedding has shape [len(phrase), seq_len, language_dim] 64 | # features = features / self.num_layers 65 | 66 | # embedded = features * mask.unsqueeze(-1).float() # use mask to zero out invalid token features 67 | # aggregate = embedded.sum(1) / (mask.sum(-1).unsqueeze(-1).float()) 68 | 69 | ret = { 70 | # "aggregate": aggregate, 71 | # "embedded": embedded, 72 | "masks": mask, 73 | "hidden": encoded_layers[-1] 74 | } 75 | return ret 76 | -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/models/deformable_detr/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # 
------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn_func import MSDeformAttnFunction 10 | 11 | -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/models/deformable_detr/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | from torch.autograd import Function 16 | from torch.autograd.function import once_differentiable 17 | 18 | import MultiScaleDeformableAttention as MSDA 19 | 20 | 21 | class MSDeformAttnFunction(Function): 22 | @staticmethod 23 | @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32) 24 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 25 | ctx.im2col_step = im2col_step 26 | output = MSDA.ms_deform_attn_forward( 27 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 28 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 29 | return output 30 | 31 | @staticmethod 32 | @once_differentiable 33 | @torch.cuda.amp.custom_bwd 34 | def backward(ctx, grad_output): 35 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 36 | grad_value, grad_sampling_loc, grad_attn_weight = \ 37 | MSDA.ms_deform_attn_backward( 38 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 39 | 40 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 41 | 42 | 43 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 44 | # for debug and test only, 45 | # need to use cuda version instead 46 | N_, S_, M_, D_ = value.shape 47 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 48 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 49 | sampling_grids = 2 * sampling_locations - 1 50 | sampling_value_list = [] 51 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 52 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 53 | value_l_ = 
value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 54 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 55 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 56 | # N_*M_, D_, Lq_, P_ 57 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 58 | mode='bilinear', padding_mode='zeros', align_corners=False) 59 | sampling_value_list.append(sampling_value_l_) 60 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 61 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 62 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 63 | return output.transpose(1, 2).contiguous() 64 | -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/models/deformable_detr/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | python3 setup.py build install --user 11 | -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/models/deformable_detr/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn import MSDeformAttn 10 | -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/models/deformable_detr/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
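# The extension is built with `python3 setup.py build install --user` (see make.sh above); get_extensions() below only emits the CUDA sources and raises if PyTorch has no CUDA support or CUDA_HOME is unset.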
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | import os 10 | import glob 11 | 12 | import torch 13 | 14 | from torch.utils.cpp_extension import CUDA_HOME 15 | from torch.utils.cpp_extension import CppExtension 16 | from torch.utils.cpp_extension import CUDAExtension 17 | 18 | from setuptools import find_packages 19 | from setuptools import setup 20 | 21 | requirements = ["torch", "torchvision"] 22 | 23 | def get_extensions(): 24 | this_dir = os.path.dirname(os.path.abspath(__file__)) 25 | extensions_dir = os.path.join(this_dir, "src") 26 | 27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 30 | 31 | sources = main_file + source_cpu 32 | extension = CppExtension 33 | extra_compile_args = {"cxx": []} 34 | define_macros = [] 35 | 36 | if torch.cuda.is_available() and CUDA_HOME is not None: 37 | extension = CUDAExtension 38 | sources += source_cuda 39 | define_macros += [("WITH_CUDA", None)] 40 | extra_compile_args["nvcc"] = [ 41 | "-DCUDA_HAS_FP16=1", 42 | "-D__CUDA_NO_HALF_OPERATORS__", 43 | "-D__CUDA_NO_HALF_CONVERSIONS__", 44 | "-D__CUDA_NO_HALF2_OPERATORS__", 45 | ] 46 | else: 47 | raise NotImplementedError('Cuda is not availabel') 48 | 49 | sources = [os.path.join(extensions_dir, s) for s in sources] 50 | include_dirs = [extensions_dir] 51 | ext_modules = [ 52 | extension( 53 | "MultiScaleDeformableAttention", 54 | sources, 55 | include_dirs=include_dirs, 56 | define_macros=define_macros, 57 | extra_compile_args=extra_compile_args, 58 | ) 59 | ] 60 | return ext_modules 61 | 62 | setup( 63 | name="MultiScaleDeformableAttention", 64 | version="1.0", 65 | author="Weijie Su", 66 | url="https://github.com/fundamentalvision/Deformable-DETR", 67 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 68 | packages=find_packages(exclude=("configs", "tests",)), 69 | ext_modules=get_extensions(), 70 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 71 | ) 72 | -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/models/deformable_detr/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | 17 | at::Tensor 18 | ms_deform_attn_cpu_forward( 19 | const at::Tensor &value, 20 | const at::Tensor &spatial_shapes, 21 | const at::Tensor &level_start_index, 22 | const at::Tensor &sampling_loc, 23 | const at::Tensor &attn_weight, 24 | const int im2col_step) 25 | { 26 | AT_ERROR("Not implement on cpu"); 27 | } 28 | 29 | std::vector 30 | ms_deform_attn_cpu_backward( 31 | const at::Tensor &value, 32 | const at::Tensor &spatial_shapes, 33 | const at::Tensor &level_start_index, 34 | const at::Tensor &sampling_loc, 35 | const at::Tensor &attn_weight, 36 | const at::Tensor &grad_output, 37 | const int im2col_step) 38 | { 39 | AT_ERROR("Not implement on cpu"); 40 | } 41 | 42 | -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/models/deformable_detr/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor 15 | ms_deform_attn_cpu_forward( 16 | const at::Tensor &value, 17 | const at::Tensor &spatial_shapes, 18 | const at::Tensor &level_start_index, 19 | const at::Tensor &sampling_loc, 20 | const at::Tensor &attn_weight, 21 | const int im2col_step); 22 | 23 | std::vector 24 | ms_deform_attn_cpu_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | 34 | -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/models/deformable_detr/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor ms_deform_attn_cuda_forward( 15 | const at::Tensor &value, 16 | const at::Tensor &spatial_shapes, 17 | const at::Tensor &level_start_index, 18 | const at::Tensor &sampling_loc, 19 | const at::Tensor &attn_weight, 20 | const int im2col_step); 21 | 22 | std::vector ms_deform_attn_cuda_backward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const at::Tensor &grad_output, 29 | const int im2col_step); 30 | 31 | -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/models/deformable_detr/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | 13 | #include "cpu/ms_deform_attn_cpu.h" 14 | 15 | #ifdef WITH_CUDA 16 | #include "cuda/ms_deform_attn_cuda.h" 17 | #endif 18 | 19 | 20 | at::Tensor 21 | ms_deform_attn_forward( 22 | const at::Tensor &value, 23 | const at::Tensor &spatial_shapes, 24 | const at::Tensor &level_start_index, 25 | const at::Tensor &sampling_loc, 26 | const at::Tensor &attn_weight, 27 | const int im2col_step) 28 | { 29 | if (value.type().is_cuda()) 30 | { 31 | #ifdef WITH_CUDA 32 | return ms_deform_attn_cuda_forward( 33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 34 | #else 35 | AT_ERROR("Not compiled with GPU support"); 36 | #endif 37 | } 38 | AT_ERROR("Not implemented on the CPU"); 39 | } 40 | 41 | std::vector 42 | ms_deform_attn_backward( 43 | const at::Tensor &value, 44 | const at::Tensor &spatial_shapes, 45 | const at::Tensor &level_start_index, 46 | const at::Tensor &sampling_loc, 47 | const at::Tensor &attn_weight, 48 | const at::Tensor &grad_output, 49 | const int im2col_step) 50 | { 51 | if (value.type().is_cuda()) 52 | { 53 | #ifdef WITH_CUDA 54 | return ms_deform_attn_cuda_backward( 55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 56 | #else 57 | AT_ERROR("Not compiled with GPU support"); 58 | #endif 59 | } 60 | AT_ERROR("Not implemented on the CPU"); 61 | } 62 | 63 | -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/models/deformable_detr/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 
2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include "ms_deform_attn.h" 12 | 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 16 | } 17 | -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/models/deformable_detr/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import time 14 | import torch 15 | import torch.nn as nn 16 | from torch.autograd import gradcheck 17 | 18 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 19 | 20 | 21 | N, M, D = 1, 2, 2 22 | Lq, L, P = 2, 2, 2 23 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 24 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 25 | S = sum([(H*W).item() for H, W in shapes]) 26 | 27 | 28 | torch.manual_seed(3) 29 | 30 | 31 | @torch.no_grad() 32 | def check_forward_equal_with_pytorch_double(): 33 | value = torch.rand(N, S, M, D).cuda() * 0.01 34 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 35 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 36 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 37 | im2col_step = 2 38 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 39 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 40 | fwdok = torch.allclose(output_cuda, output_pytorch) 41 | max_abs_err = (output_cuda - output_pytorch).abs().max() 42 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 43 | 44 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 45 | 46 | 47 | @torch.no_grad() 48 | def check_forward_equal_with_pytorch_float(): 49 | value = torch.rand(N, S, M, D).cuda() * 0.01 50 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 51 | attention_weights = 
torch.rand(N, Lq, M, L, P).cuda() + 1e-5 52 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 53 | im2col_step = 2 54 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 55 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 56 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 57 | max_abs_err = (output_cuda - output_pytorch).abs().max() 58 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 59 | 60 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 61 | 62 | 63 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 64 | 65 | value = torch.rand(N, S, M, channels).cuda() * 0.01 66 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 67 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 68 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 69 | im2col_step = 2 70 | func = MSDeformAttnFunction.apply 71 | 72 | value.requires_grad = grad_value 73 | sampling_locations.requires_grad = grad_sampling_loc 74 | attention_weights.requires_grad = grad_attn_weight 75 | 76 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 77 | 78 | print(f'* {gradok} check_gradient_numerical(D={channels})') 79 | 80 | 81 | if __name__ == '__main__': 82 | check_forward_equal_with_pytorch_double() 83 | check_forward_equal_with_pytorch_float() 84 | 85 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 86 | check_gradient_numerical(channels, True, True, True) 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/models/deformable_detr/position_encoding.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Various positional encodings for the transformer. 12 | """ 13 | import math 14 | import torch 15 | from torch import nn 16 | 17 | from ...util.misc import NestedTensor 18 | 19 | 20 | class PositionEmbeddingSine(nn.Module): 21 | """ 22 | This is a more standard version of the position embedding, very similar to the one 23 | used by the Attention is all you need paper, generalized to work on images. 
24 | """ 25 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 26 | super().__init__() 27 | self.num_pos_feats = num_pos_feats 28 | self.temperature = temperature 29 | self.normalize = normalize 30 | if scale is not None and normalize is False: 31 | raise ValueError("normalize should be True if scale is passed") 32 | if scale is None: 33 | scale = 2 * math.pi 34 | self.scale = scale 35 | 36 | def forward(self, tensor_list: NestedTensor): 37 | x = tensor_list.tensors 38 | mask = tensor_list.mask 39 | assert mask is not None 40 | not_mask = ~mask 41 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 42 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 43 | if self.normalize: 44 | eps = 1e-6 45 | y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale 46 | x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale 47 | 48 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 49 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 50 | 51 | pos_x = x_embed[:, :, :, None] / dim_t 52 | pos_y = y_embed[:, :, :, None] / dim_t 53 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 54 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 55 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 56 | return pos 57 | 58 | 59 | class PositionEmbeddingLearned(nn.Module): 60 | """ 61 | Absolute pos embedding, learned. 62 | """ 63 | def __init__(self, num_pos_feats=256): 64 | super().__init__() 65 | self.row_embed = nn.Embedding(50, num_pos_feats) 66 | self.col_embed = nn.Embedding(50, num_pos_feats) 67 | self.reset_parameters() 68 | 69 | def reset_parameters(self): 70 | nn.init.uniform_(self.row_embed.weight) 71 | nn.init.uniform_(self.col_embed.weight) 72 | 73 | def forward(self, tensor_list: NestedTensor): 74 | x = tensor_list.tensors 75 | h, w = x.shape[-2:] 76 | i = torch.arange(w, device=x.device) 77 | j = torch.arange(h, device=x.device) 78 | x_emb = self.col_embed(i) 79 | y_emb = self.row_embed(j) 80 | pos = torch.cat([ 81 | x_emb.unsqueeze(0).repeat(h, 1, 1), 82 | y_emb.unsqueeze(1).repeat(1, w, 1), 83 | ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) 84 | return pos 85 | 86 | 87 | def build_position_encoding(args): 88 | N_steps = args.hidden_dim // 2 89 | if args.position_embedding in ('v2', 'sine'): 90 | # TODO find a better way of exposing other arguments 91 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True) 92 | elif args.position_embedding in ('v3', 'learned'): 93 | position_embedding = PositionEmbeddingLearned(N_steps) 94 | else: 95 | raise ValueError(f"not supported {args.position_embedding}") 96 | 97 | return position_embedding 98 | -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/util/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | -------------------------------------------------------------------------------- /projects/DDETRS/ddetrs/util/plot_utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Plotting utilities to visualize training logs. 12 | """ 13 | import torch 14 | import pandas as pd 15 | import seaborn as sns 16 | import matplotlib.pyplot as plt 17 | 18 | from pathlib import Path, PurePath 19 | 20 | 21 | def plot_logs(logs, fields=('class_error', 'loss_bbox_unscaled', 'mAP'), ewm_col=0, log_name='log.txt'): 22 | ''' 23 | Function to plot specific fields from training log(s). Plots both training and test results. 24 | 25 | :: Inputs - logs = list containing Path objects, each pointing to individual dir with a log file 26 | - fields = which results to plot from each log file - plots both training and test for each field. 27 | - ewm_col = optional, which column to use as the exponential weighted smoothing of the plots 28 | - log_name = optional, name of log file if different than default 'log.txt'. 29 | 30 | :: Outputs - matplotlib plots of results in fields, color coded for each log file. 31 | - solid lines are training results, dashed lines are test results. 
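:: Example (paths are illustrative) - plot_logs([Path('output/exp1'), Path('output/exp2')], fields=('class_error', 'loss_bbox_unscaled', 'mAP'))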
32 | 33 | ''' 34 | func_name = "plot_utils.py::plot_logs" 35 | 36 | # verify logs is a list of Paths (list[Paths]) or single Pathlib object Path, 37 | # convert single Path to list to avoid 'not iterable' error 38 | 39 | if not isinstance(logs, list): 40 | if isinstance(logs, PurePath): 41 | logs = [logs] 42 | print(f"{func_name} info: logs param expects a list argument, converted to list[Path].") 43 | else: 44 | raise ValueError(f"{func_name} - invalid argument for logs parameter.\n \ 45 | Expect list[Path] or single Path obj, received {type(logs)}") 46 | 47 | # verify valid dir(s) and that every item in list is Path object 48 | for i, dir in enumerate(logs): 49 | if not isinstance(dir, PurePath): 50 | raise ValueError(f"{func_name} - non-Path object in logs argument of {type(dir)}: \n{dir}") 51 | if dir.exists(): 52 | continue 53 | raise ValueError(f"{func_name} - invalid directory in logs argument:\n{dir}") 54 | 55 | # load log file(s) and plot 56 | dfs = [pd.read_json(Path(p) / log_name, lines=True) for p in logs] 57 | 58 | fig, axs = plt.subplots(ncols=len(fields), figsize=(16, 5)) 59 | 60 | for df, color in zip(dfs, sns.color_palette(n_colors=len(logs))): 61 | for j, field in enumerate(fields): 62 | if field == 'mAP': 63 | coco_eval = pd.DataFrame(pd.np.stack(df.test_coco_eval.dropna().values)[:, 1]).ewm(com=ewm_col).mean() 64 | axs[j].plot(coco_eval, c=color) 65 | else: 66 | df.interpolate().ewm(com=ewm_col).mean().plot( 67 | y=[f'train_{field}', f'test_{field}'], 68 | ax=axs[j], 69 | color=[color] * 2, 70 | style=['-', '--'] 71 | ) 72 | for ax, field in zip(axs, fields): 73 | ax.legend([Path(p).name for p in logs]) 74 | ax.set_title(field) 75 | 76 | 77 | def plot_precision_recall(files, naming_scheme='iter'): 78 | if naming_scheme == 'exp_id': 79 | # name becomes exp_id 80 | names = [f.parts[-3] for f in files] 81 | elif naming_scheme == 'iter': 82 | names = [f.stem for f in files] 83 | else: 84 | raise ValueError(f'not supported {naming_scheme}') 85 | fig, axs = plt.subplots(ncols=2, figsize=(16, 5)) 86 | for f, color, name in zip(files, sns.color_palette("Blues", n_colors=len(files)), names): 87 | data = torch.load(f) 88 | # precision is n_iou, n_points, n_cat, n_area, max_det 89 | precision = data['precision'] 90 | recall = data['params'].recThrs 91 | scores = data['scores'] 92 | # take precision for all classes, all areas and 100 detections 93 | precision = precision[0, :, :, 0, -1].mean(1) 94 | scores = scores[0, :, :, 0, -1].mean(1) 95 | prec = precision.mean() 96 | rec = data['recall'][0, :, 0, -1].mean() 97 | print(f'{naming_scheme} {name}: mAP@50={prec * 100: 05.1f}, ' + 98 | f'score={scores.mean():0.3f}, ' + 99 | f'f1={2 * prec * rec / (prec + rec + 1e-8):0.3f}' 100 | ) 101 | axs[0].plot(recall, precision, c=color) 102 | axs[1].plot(recall, scores, c=color) 103 | 104 | axs[0].set_title('Precision / Recall') 105 | axs[0].legend(names) 106 | axs[1].set_title('Scores / Recall') 107 | axs[1].legend(names) 108 | return fig, axs 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | shapely==1.7.1 2 | git+https://github.com/XD7479/cocoapi.git#egg=pycocotools&subdirectory=PythonAPI 3 | git+https://github.com/lvis-dataset/lvis-api.git 4 | jpeg4py 5 | visdom 6 | easydict 7 | scikit-image 8 | omegaconf 9 | boto3 10 | ipdb 11 | git+https://github.com/openai/CLIP.git 12 | transformers==4.25 13 | einops 14 | timm==0.9.8 15 | 
opencv-python-headless 16 | git+https://github.com/salaniz/pycocoevalcap.git 17 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length=100 3 | multi_line_output=3 4 | include_trailing_comma=True 5 | known_standard_library=numpy,setuptools,mock 6 | skip=./datasets,docs 7 | skip_glob=*/__init__.py,**/configs/**,tests/config/** 8 | known_myself=detectron2 9 | known_third_party=fvcore,matplotlib,cv2,torch,torchvision,PIL,pycocotools,yacs,termcolor,cityscapesscripts,tabulate,tqdm,scipy,lvis,psutil,pkg_resources,caffe2,onnx,panopticapi,black,isort,av,iopath,omegaconf,hydra,yaml,pydoc,submitit,cloudpickle 10 | no_lines_before=STDLIB,THIRDPARTY 11 | sections=FUTURE,STDLIB,THIRDPARTY,myself,FIRSTPARTY,LOCALFOLDER 12 | default_section=FIRSTPARTY 13 | 14 | [mypy] 15 | python_version=3.6 16 | ignore_missing_imports = True 17 | warn_unused_configs = True 18 | disallow_untyped_defs = True 19 | check_untyped_defs = True 20 | warn_unused_ignores = True 21 | warn_redundant_casts = True 22 | show_column_numbers = True 23 | follow_imports = silent 24 | allow_redefinition = True 25 | ; Require all functions to be annotated 26 | disallow_incomplete_defs = True 27 | -------------------------------------------------------------------------------- /tools/convert-pretrained-swin-model-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | 4 | import pickle as pkl 5 | import sys 6 | 7 | import torch 8 | 9 | """ 10 | Usage: 11 | # download pretrained swin model: 12 | wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth 13 | # run the conversion 14 | ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl 15 | # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config: 16 | MODEL: 17 | WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl" 18 | INPUT: 19 | FORMAT: "RGB" 20 | """ 21 | 22 | if __name__ == "__main__": 23 | input = sys.argv[1] 24 | 25 | obj = torch.load(input, map_location="cpu")["model"] 26 | 27 | res = {"model": obj, "__author__": "third_party", "matching_heuristics": True} 28 | 29 | with open(sys.argv[2], "wb") as f: 30 | pkl.dump(res, f) -------------------------------------------------------------------------------- /tools/evaluate_ap_fixed.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from collections import defaultdict 4 | from pathlib import Path 5 | 6 | from lvis import LVIS, LVISResults, LVISEval 7 | import sys,os 8 | sys.path.append('./') 9 | from logger import setup_logger 10 | 11 | 12 | def main(): 13 | # Use first line of file docstring as description if it exists. 
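# Fixed-AP protocol: keep only the top `--dets-per-cat` scoring detections per category (10000 by default), then evaluate with the LVIS API with no per-image detection cap (params.max_dets = -1).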
14 | parser = argparse.ArgumentParser( 15 | description=__doc__.split("\n")[0] if __doc__ else "", 16 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 17 | ) 18 | parser.add_argument("annotations_json", type=Path) 19 | parser.add_argument("results_json", type=Path) 20 | parser.add_argument("output_dir", type=Path) 21 | parser.add_argument("--type", default="bbox", choices=["segm", "bbox"]) 22 | parser.add_argument("--dets-per-cat", default=10000, type=int) 23 | parser.add_argument("--ious", nargs="*", type=float) 24 | 25 | args = parser.parse_args() 26 | args.output_dir.mkdir(exist_ok=True, parents=True) 27 | logger = setup_logger(output=str(args.output_dir.resolve()), name=__file__) 28 | log_path = args.output_dir / "log.txt" 29 | 30 | assert args.dets_per_cat > 0 31 | with open(args.results_json, "r") as f: 32 | results = json.load(f) 33 | 34 | by_cat = defaultdict(list) 35 | for ann in results: 36 | by_cat[ann["category_id"]].append(ann) 37 | results = [] 38 | topk = args.dets_per_cat 39 | missing_dets_cats = set() 40 | for cat, cat_anns in by_cat.items(): 41 | if len(cat_anns) < topk: 42 | missing_dets_cats.add(cat) 43 | results.extend(sorted(cat_anns, key=lambda x: x["score"], reverse=True)[:topk]) 44 | 45 | if args.type == "segm": 46 | # When evaluating mask AP, if the results contain bbox, LVIS API will 47 | # use the box area as the area of the instance, instead of the mask 48 | # area. This leads to a different definition of small/medium/large. 49 | # We remove the bbox field to let mask AP use mask area. 50 | for x in results: 51 | x.pop("bbox", None) 52 | 53 | if missing_dets_cats: 54 | logger.warning( 55 | f"\n===\n" 56 | f"{len(missing_dets_cats)} classes had less than {topk} detections!\n" 57 | f"Outputting {topk} detections for each class will improve AP further.\n" 58 | f"If using detectron2, please use the lvdevil/infer_topk.py script to " 59 | f"output a results file with {topk} detections for each class.\n" 60 | f"===" 61 | ) 62 | 63 | gt = LVIS(args.annotations_json) 64 | results = LVISResults(gt, results, max_dets=-1) 65 | lvis_eval = LVISEval(gt, results, iou_type=args.type) 66 | params = lvis_eval.params 67 | params.max_dets = -1 # No limit on detections per image. 68 | if args.ious: 69 | params.iou_thrs = args.ious 70 | 71 | lvis_eval.run() 72 | lvis_eval.print_results() 73 | metrics = {k: v for k, v in lvis_eval.results.items() if k.startswith("AP")} 74 | logger.info("copypaste: %s,%s", ",".join(map(str, metrics.keys())), "path") 75 | logger.info( 76 | "copypaste: %s,%s", 77 | ",".join(f"{v*100:.2f}" for v in metrics.values()), 78 | log_path, 79 | ) 80 | 81 | 82 | if __name__ == "__main__": 83 | main() 84 | -------------------------------------------------------------------------------- /tools/logger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | # Copied from detectron2 to avoid dependency. 3 | import functools 4 | import logging 5 | import os 6 | import sys 7 | from fvcore.common.file_io import PathManager 8 | from termcolor import colored 9 | 10 | 11 | class _ColorfulFormatter(logging.Formatter): 12 | def __init__(self, *args, **kwargs): 13 | self._root_name = kwargs.pop("root_name") + "." 14 | self._abbrev_name = kwargs.pop("abbrev_name", "") 15 | if len(self._abbrev_name): 16 | self._abbrev_name = self._abbrev_name + "." 
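# A trailing '.' is kept on both names so that formatMessage() only rewrites the logger-name prefix (e.g. 'detectron2.data' -> 'd2.data').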
17 | super(_ColorfulFormatter, self).__init__(*args, **kwargs) 18 | 19 | def formatMessage(self, record): 20 | record.name = record.name.replace(self._root_name, self._abbrev_name) 21 | log = super(_ColorfulFormatter, self).formatMessage(record) 22 | if record.levelno == logging.WARNING: 23 | prefix = colored("WARNING", "red", attrs=["blink"]) 24 | elif record.levelno == logging.ERROR or record.levelno == logging.CRITICAL: 25 | prefix = colored("ERROR", "red", attrs=["blink", "underline"]) 26 | else: 27 | return log 28 | return prefix + " " + log 29 | 30 | 31 | @functools.lru_cache() # so that calling setup_logger multiple times won't add many handlers 32 | def setup_logger( 33 | output=None, distributed_rank=0, *, color=True, name="detectron2", abbrev_name=None 34 | ): 35 | """ 36 | Initialize the detectron2 logger and set its verbosity level to "DEBUG". 37 | 38 | Args: 39 | output (str): a file name or a directory to save log. If None, will not save log file. 40 | If ends with ".txt" or ".log", assumed to be a file name. 41 | Otherwise, logs will be saved to `output/log.txt`. 42 | name (str): the root module name of this logger 43 | abbrev_name (str): an abbreviation of the module, to avoid long names in logs. 44 | Set to "" to not log the root module in logs. 45 | By default, will abbreviate "detectron2" to "d2" and leave other 46 | modules unchanged. 47 | 48 | Returns: 49 | logging.Logger: a logger 50 | """ 51 | logger = logging.getLogger(name) 52 | logger.setLevel(logging.DEBUG) 53 | logger.propagate = False 54 | 55 | if abbrev_name is None: 56 | abbrev_name = "d2" if name == "detectron2" else name 57 | 58 | plain_formatter = logging.Formatter( 59 | "[%(asctime)s] %(name)s %(levelname)s: %(message)s", datefmt="%m/%d %H:%M:%S" 60 | ) 61 | # stdout logging: master only 62 | if distributed_rank == 0: 63 | ch = logging.StreamHandler(stream=sys.stdout) 64 | ch.setLevel(logging.DEBUG) 65 | if color: 66 | formatter = _ColorfulFormatter( 67 | colored("[%(asctime)s %(name)s]: ", "green") + "%(message)s", 68 | datefmt="%m/%d %H:%M:%S", 69 | root_name=name, 70 | abbrev_name=str(abbrev_name), 71 | ) 72 | else: 73 | formatter = plain_formatter 74 | ch.setFormatter(formatter) 75 | logger.addHandler(ch) 76 | 77 | # file logging: all workers 78 | if output is not None: 79 | if output.endswith(".txt") or output.endswith(".log"): 80 | filename = output 81 | else: 82 | filename = os.path.join(output, "log.txt") 83 | if distributed_rank > 0: 84 | filename = filename + ".rank{}".format(distributed_rank) 85 | PathManager.mkdirs(os.path.dirname(filename)) 86 | 87 | fh = logging.StreamHandler(_cached_log_stream(filename)) 88 | fh.setLevel(logging.DEBUG) 89 | fh.setFormatter(plain_formatter) 90 | logger.addHandler(fh) 91 | 92 | return logger 93 | 94 | 95 | # cache the opened file object, so that different calls to `setup_logger` 96 | # with the same file name can safely write to the same file. 97 | @functools.lru_cache(maxsize=None) 98 | def _cached_log_stream(filename): 99 | return PathManager.open(filename, "a") 100 | 101 | --------------------------------------------------------------------------------
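For reference, a minimal usage sketch of the logger above, mirroring how tools/evaluate_ap_fixed.py calls it; the output directory name here is only an example:

from logger import setup_logger  # tools/ must be on sys.path, as in evaluate_ap_fixed.py

logger = setup_logger(output="output/eval", name=__file__)  # writes to output/eval/log.txt
logger.info("starting evaluation")
logger.warning("warnings are highlighted in red on stdout by _ColorfulFormatter")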