├── mmdet ├── ops │ ├── nms │ │ ├── .gitignore │ │ ├── __init__.py │ │ ├── Makefile │ │ ├── gpu_nms.hpp │ │ ├── gpu_nms.pyx │ │ ├── nms_wrapper.py │ │ ├── cpu_nms.pyx │ │ ├── setup.py │ │ └── cpu_soft_nms.pyx │ ├── roi_pool │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── roi_pool.py │ │ ├── functions │ │ │ ├── __init__.py │ │ │ └── roi_pool.py │ │ ├── __init__.py │ │ ├── setup.py │ │ ├── gradcheck.py │ │ └── src │ │ │ └── roi_pool_cuda.cpp │ ├── roi_align │ │ ├── functions │ │ │ ├── __init__.py │ │ │ └── roi_align.py │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── roi_align.py │ │ ├── __init__.py │ │ ├── setup.py │ │ ├── gradcheck.py │ │ └── src │ │ │ └── roi_align_cuda.cpp │ └── __init__.py ├── models │ ├── necks │ │ └── __init__.py │ ├── mask_heads │ │ └── __init__.py │ ├── track_heads │ │ └── __init__.py │ ├── roi_extractors │ │ ├── __init__.py │ │ └── single_level.py │ ├── backbones │ │ └── __init__.py │ ├── bbox_heads │ │ └── __init__.py │ ├── anchor_heads │ │ ├── __init__.py │ │ ├── retina_head.py │ │ └── rpn_head.py │ ├── utils │ │ ├── __init__.py │ │ ├── weight_init.py │ │ ├── norm.py │ │ └── conv_module.py │ ├── detectors │ │ ├── __init__.py │ │ ├── retinanet.py │ │ ├── faster_rcnn.py │ │ ├── mask_rcnn.py │ │ ├── fast_rcnn.py │ │ ├── single_stage.py │ │ └── rpn.py │ ├── __init__.py │ ├── registry.py │ └── builder.py ├── __init__.py ├── core │ ├── mask │ │ ├── __init__.py │ │ ├── utils.py │ │ └── mask_target.py │ ├── anchor │ │ ├── __init__.py │ │ └── anchor_generator.py │ ├── bbox │ │ ├── assigners │ │ │ ├── __init__.py │ │ │ ├── base_assigner.py │ │ │ └── assign_result.py │ │ ├── samplers │ │ │ ├── combined_sampler.py │ │ │ ├── __init__.py │ │ │ ├── pseudo_sampler.py │ │ │ ├── sampling_result.py │ │ │ ├── instance_balanced_pos_sampler.py │ │ │ ├── random_sampler.py │ │ │ ├── ohem_sampler.py │ │ │ ├── iou_balanced_neg_sampler.py │ │ │ └── base_sampler.py │ │ ├── __init__.py │ │ ├── assign_sampling.py │ │ ├── geometry.py │ │ └── bbox_target.py │ ├── utils │ │ ├── __init__.py │ │ ├── misc.py │ │ └── dist_utils.py │ ├── __init__.py │ ├── post_processing │ │ ├── __init__.py │ │ ├── bbox_nms.py │ │ └── merge_augs.py │ ├── loss │ │ ├── __init__.py │ │ └── losses.py │ └── evaluation │ │ ├── __init__.py │ │ └── bbox_overlaps.py ├── datasets │ ├── loader │ │ ├── __init__.py │ │ └── build_loader.py │ ├── repeat_dataset.py │ ├── voc.py │ ├── __init__.py │ ├── concat_dataset.py │ ├── xml_style.py │ ├── utils.py │ └── transforms.py └── apis │ ├── __init__.py │ ├── env.py │ ├── inference.py │ └── train.py ├── doc ├── framework.png └── sample_gt.png ├── demo └── coco_test_12510.jpg ├── .travis.yml ├── tools ├── dist_train.sh ├── coco_eval.py ├── voc_eval.py ├── train.py └── test_video.py ├── compile.sh ├── INSTALL.md ├── .gitignore ├── setup.py ├── configs ├── retinanet_r50_fpn_1x.py ├── retinanet_r101_fpn_1x.py ├── retinanet_x101_32x4d_fpn_1x.py ├── retinanet_x101_64x4d_fpn_1x.py ├── rpn_r50_fpn_1x.py ├── rpn_r101_fpn_1x.py ├── rpn_x101_32x4d_fpn_1x.py ├── rpn_x101_64x4d_fpn_1x.py ├── fast_rcnn_r50_fpn_1x.py ├── fast_rcnn_r101_fpn_1x.py ├── ssd300_coco.py ├── ssd512_coco.py └── pascal_voc │ └── ssd300_voc.py ├── README.md └── TECHNICAL_DETAILS.md /mmdet/ops/nms/.gitignore: -------------------------------------------------------------------------------- 1 | *.cpp 2 | -------------------------------------------------------------------------------- /mmdet/ops/roi_pool/modules/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /mmdet/ops/roi_align/functions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mmdet/ops/roi_align/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mmdet/ops/roi_pool/functions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mmdet/models/necks/__init__.py: -------------------------------------------------------------------------------- 1 | from .fpn import FPN 2 | 3 | __all__ = ['FPN'] 4 | -------------------------------------------------------------------------------- /doc/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/youtubevos/MaskTrackRCNN/HEAD/doc/framework.png -------------------------------------------------------------------------------- /doc/sample_gt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/youtubevos/MaskTrackRCNN/HEAD/doc/sample_gt.png -------------------------------------------------------------------------------- /demo/coco_test_12510.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/youtubevos/MaskTrackRCNN/HEAD/demo/coco_test_12510.jpg -------------------------------------------------------------------------------- /mmdet/ops/nms/__init__.py: -------------------------------------------------------------------------------- 1 | from .nms_wrapper import nms, soft_nms 2 | 3 | __all__ = ['nms', 'soft_nms'] 4 | -------------------------------------------------------------------------------- /mmdet/models/mask_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .fcn_mask_head import FCNMaskHead 2 | 3 | __all__ = ['FCNMaskHead'] 4 | -------------------------------------------------------------------------------- /mmdet/models/track_heads/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .track_head import TrackHead 3 | 4 | __all__ = ['TrackHead'] 5 | -------------------------------------------------------------------------------- /mmdet/__init__.py: -------------------------------------------------------------------------------- 1 | from .version import __version__, short_version 2 | 3 | __all__ = ['__version__', 'short_version'] 4 | -------------------------------------------------------------------------------- /mmdet/models/roi_extractors/__init__.py: -------------------------------------------------------------------------------- 1 | from .single_level import SingleRoIExtractor 2 | 3 | __all__ = ['SingleRoIExtractor'] 4 | -------------------------------------------------------------------------------- /mmdet/ops/roi_pool/__init__.py: -------------------------------------------------------------------------------- 1 | from .functions.roi_pool import roi_pool 2 | from .modules.roi_pool import RoIPool 3 | 4 | __all__ = ['roi_pool', 'RoIPool'] 5 | -------------------------------------------------------------------------------- /mmdet/ops/roi_align/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .functions.roi_align import roi_align 2 | from .modules.roi_align import RoIAlign 3 | 4 | __all__ = ['roi_align', 'RoIAlign'] 5 | -------------------------------------------------------------------------------- /mmdet/core/mask/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import split_combined_polys 2 | from .mask_target import mask_target 3 | 4 | __all__ = ['split_combined_polys', 'mask_target'] 5 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: trusty 2 | language: python 3 | 4 | install: 5 | - pip install flake8 6 | 7 | python: 8 | - "3.5" 9 | - "3.6" 10 | 11 | script: 12 | - flake8 -------------------------------------------------------------------------------- /mmdet/ops/nms/Makefile: -------------------------------------------------------------------------------- 1 | PYTHON=${PYTHON:-python} 2 | 3 | all: 4 | echo "Compiling nms kernels..." 5 | $(PYTHON) setup.py build_ext --inplace 6 | 7 | clean: 8 | rm -f *.so 9 | -------------------------------------------------------------------------------- /mmdet/core/anchor/__init__.py: -------------------------------------------------------------------------------- 1 | from .anchor_generator import AnchorGenerator 2 | from .anchor_target import anchor_target 3 | 4 | __all__ = ['AnchorGenerator', 'anchor_target'] 5 | -------------------------------------------------------------------------------- /mmdet/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet import ResNet 2 | from .resnext import ResNeXt 3 | from .ssd_vgg import SSDVGG 4 | 5 | __all__ = ['ResNet', 'ResNeXt', 'SSDVGG'] 6 | -------------------------------------------------------------------------------- /tools/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | PYTHON=${PYTHON:-"python"} 4 | 5 | $PYTHON -m torch.distributed.launch --nproc_per_node=$2 $(dirname "$0")/train.py $1 --launcher pytorch ${@:3} 6 | -------------------------------------------------------------------------------- /mmdet/models/bbox_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .bbox_head import BBoxHead 2 | from .convfc_bbox_head import ConvFCBBoxHead, SharedFCBBoxHead 3 | 4 | __all__ = ['BBoxHead', 'ConvFCBBoxHead', 'SharedFCBBoxHead'] 5 | -------------------------------------------------------------------------------- /mmdet/ops/nms/gpu_nms.hpp: -------------------------------------------------------------------------------- 1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 2 | int boxes_dim, float nms_overlap_thresh, int device_id, size_t base); 3 | size_t nms_Malloc(); 4 | -------------------------------------------------------------------------------- /mmdet/ops/__init__.py: -------------------------------------------------------------------------------- 1 | from .nms import nms, soft_nms 2 | from .roi_align import RoIAlign, roi_align 3 | from .roi_pool import RoIPool, roi_pool 4 | 5 | __all__ = ['nms', 'soft_nms', 'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool'] 6 | -------------------------------------------------------------------------------- 
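The `mmdet/ops` package above re-exports the compiled NMS and RoI operators. A minimal usage sketch for the NMS wrapper follows; it assumes the py-faster-rcnn-style convention of this lineage (an `(N, 5)` float32 array of `[x1, y1, x2, y2, score]` rows in, kept indices out); the actual signature lives in `nms_wrapper.py`, which is not included in this listing.

```python
import numpy as np
from mmdet.ops import nms

# three detections; the second heavily overlaps the first
dets = np.array([[10, 10, 50, 50, 0.9],
                 [12, 12, 52, 52, 0.8],
                 [100, 100, 140, 140, 0.7]], dtype=np.float32)

# suppress boxes whose IoU with a higher-scoring kept box exceeds 0.5
keep = nms(dets, 0.5)  # assumed to return the indices of surviving boxes
print(dets[keep])      # expected: the first and third boxes
```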
/mmdet/datasets/loader/__init__.py: -------------------------------------------------------------------------------- 1 | from .build_loader import build_dataloader 2 | from .sampler import GroupSampler, DistributedGroupSampler 3 | 4 | __all__ = [ 5 | 'GroupSampler', 'DistributedGroupSampler', 'build_dataloader' 6 | ] 7 | -------------------------------------------------------------------------------- /mmdet/core/bbox/assigners/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_assigner import BaseAssigner 2 | from .max_iou_assigner import MaxIoUAssigner 3 | from .assign_result import AssignResult 4 | 5 | __all__ = ['BaseAssigner', 'MaxIoUAssigner', 'AssignResult'] 6 | -------------------------------------------------------------------------------- /mmdet/models/anchor_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .anchor_head import AnchorHead 2 | from .rpn_head import RPNHead 3 | from .retina_head import RetinaHead 4 | from .ssd_head import SSDHead 5 | 6 | __all__ = ['AnchorHead', 'RPNHead', 'RetinaHead', 'SSDHead'] 7 | -------------------------------------------------------------------------------- /mmdet/core/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .dist_utils import allreduce_grads, DistOptimizerHook 2 | from .misc import tensor2imgs, unmap, multi_apply 3 | 4 | __all__ = [ 5 | 'allreduce_grads', 'DistOptimizerHook', 'tensor2imgs', 'unmap', 6 | 'multi_apply' 7 | ] 8 | -------------------------------------------------------------------------------- /mmdet/core/bbox/assigners/base_assigner.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | 3 | 4 | class BaseAssigner(metaclass=ABCMeta): 5 | 6 | @abstractmethod 7 | def assign(self, bboxes, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None): 8 | pass 9 | -------------------------------------------------------------------------------- /mmdet/apis/__init__.py: -------------------------------------------------------------------------------- 1 | from .env import init_dist, get_root_logger, set_random_seed 2 | from .train import train_detector 3 | from .inference import inference_detector, show_result 4 | 5 | __all__ = [ 6 | 'init_dist', 'get_root_logger', 'set_random_seed', 'train_detector', 7 | 'inference_detector', 'show_result' 8 | ] 9 | -------------------------------------------------------------------------------- /mmdet/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .anchor import * # noqa: F401, F403 2 | from .bbox import * # noqa: F401, F403 3 | from .mask import * # noqa: F401, F403 4 | from .loss import * # noqa: F401, F403 5 | from .evaluation import * # noqa: F401, F403 6 | from .post_processing import * # noqa: F401, F403 7 | from .utils import * # noqa: F401, F403 8 | -------------------------------------------------------------------------------- /mmdet/core/post_processing/__init__.py: -------------------------------------------------------------------------------- 1 | from .bbox_nms import multiclass_nms 2 | from .merge_augs import (merge_aug_proposals, merge_aug_bboxes, 3 | merge_aug_scores, merge_aug_masks) 4 | 5 | __all__ = [ 6 | 'multiclass_nms', 'merge_aug_proposals', 'merge_aug_bboxes', 7 | 'merge_aug_scores', 'merge_aug_masks' 8 | ] 9 | 
-------------------------------------------------------------------------------- /mmdet/ops/roi_pool/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 3 | 4 | setup( 5 | name='roi_pool', 6 | ext_modules=[ 7 | CUDAExtension('roi_pool_cuda', [ 8 | 'src/roi_pool_cuda.cpp', 9 | 'src/roi_pool_kernel.cu', 10 | ]) 11 | ], 12 | cmdclass={'build_ext': BuildExtension}) 13 | -------------------------------------------------------------------------------- /mmdet/models/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .conv_module import ConvModule 2 | from .norm import build_norm_layer 3 | from .weight_init import (xavier_init, normal_init, uniform_init, kaiming_init, 4 | bias_init_with_prob) 5 | 6 | __all__ = [ 7 | 'ConvModule', 'build_norm_layer', 'xavier_init', 'normal_init', 8 | 'uniform_init', 'kaiming_init', 'bias_init_with_prob' 9 | ] 10 | -------------------------------------------------------------------------------- /mmdet/ops/roi_align/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 3 | 4 | setup( 5 | name='roi_align_cuda', 6 | ext_modules=[ 7 | CUDAExtension('roi_align_cuda', [ 8 | 'src/roi_align_cuda.cpp', 9 | 'src/roi_align_kernel.cu', 10 | ]), 11 | ], 12 | cmdclass={'build_ext': BuildExtension}) 13 | -------------------------------------------------------------------------------- /compile.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | PYTHON=${PYTHON:-"python"} 4 | 5 | echo "Building roi align op..." 6 | cd mmdet/ops/roi_align 7 | if [ -d "build" ]; then 8 | rm -r build 9 | fi 10 | $PYTHON setup.py build_ext --inplace 11 | 12 | echo "Building roi pool op..." 13 | cd ../roi_pool 14 | if [ -d "build" ]; then 15 | rm -r build 16 | fi 17 | $PYTHON setup.py build_ext --inplace 18 | 19 | echo "Building nms op..." 
20 | cd ../nms 21 | make clean 22 | make PYTHON=${PYTHON} 23 | -------------------------------------------------------------------------------- /mmdet/ops/roi_pool/modules/roi_pool.py: -------------------------------------------------------------------------------- 1 | from torch.nn.modules.module import Module 2 | from ..functions.roi_pool import roi_pool 3 | 4 | 5 | class RoIPool(Module): 6 | 7 | def __init__(self, out_size, spatial_scale): 8 | super(RoIPool, self).__init__() 9 | 10 | self.out_size = out_size 11 | self.spatial_scale = float(spatial_scale) 12 | 13 | def forward(self, features, rois): 14 | return roi_pool(features, rois, self.out_size, self.spatial_scale) 15 | -------------------------------------------------------------------------------- /mmdet/models/detectors/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseDetector 2 | from .single_stage import SingleStageDetector 3 | from .two_stage import TwoStageDetector 4 | from .rpn import RPN 5 | from .fast_rcnn import FastRCNN 6 | from .faster_rcnn import FasterRCNN 7 | from .mask_rcnn import MaskRCNN 8 | from .cascade_rcnn import CascadeRCNN 9 | from .retinanet import RetinaNet 10 | 11 | __all__ = [ 12 | 'BaseDetector', 'SingleStageDetector', 'TwoStageDetector', 'RPN', 13 | 'FastRCNN', 'FasterRCNN', 'MaskRCNN', 'CascadeRCNN', 'RetinaNet' 14 | ] 15 | -------------------------------------------------------------------------------- /mmdet/core/loss/__init__.py: -------------------------------------------------------------------------------- 1 | from .losses import (weighted_nll_loss, weighted_cross_entropy, 2 | weighted_binary_cross_entropy, sigmoid_focal_loss, 3 | weighted_sigmoid_focal_loss, mask_cross_entropy, 4 | smooth_l1_loss, weighted_smoothl1, accuracy) 5 | 6 | __all__ = [ 7 | 'weighted_nll_loss', 'weighted_cross_entropy', 8 | 'weighted_binary_cross_entropy', 'sigmoid_focal_loss', 9 | 'weighted_sigmoid_focal_loss', 'mask_cross_entropy', 'smooth_l1_loss', 10 | 'weighted_smoothl1', 'accuracy' 11 | ] 12 | -------------------------------------------------------------------------------- /mmdet/datasets/repeat_dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class RepeatDataset(object): 5 | 6 | def __init__(self, dataset, times): 7 | self.dataset = dataset 8 | self.times = times 9 | self.CLASSES = dataset.CLASSES 10 | if hasattr(self.dataset, 'flag'): 11 | self.flag = np.tile(self.dataset.flag, times) 12 | 13 | self._ori_len = len(self.dataset) 14 | 15 | def __getitem__(self, idx): 16 | return self.dataset[idx % self._ori_len] 17 | 18 | def __len__(self): 19 | return self.times * self._ori_len 20 | -------------------------------------------------------------------------------- /mmdet/models/detectors/retinanet.py: -------------------------------------------------------------------------------- 1 | from .single_stage import SingleStageDetector 2 | from ..registry import DETECTORS 3 | 4 | 5 | @DETECTORS.register_module 6 | class RetinaNet(SingleStageDetector): 7 | 8 | def __init__(self, 9 | backbone, 10 | neck, 11 | bbox_head, 12 | train_cfg=None, 13 | test_cfg=None, 14 | pretrained=None): 15 | super(RetinaNet, self).__init__(backbone, neck, bbox_head, train_cfg, 16 | test_cfg, pretrained) 17 | -------------------------------------------------------------------------------- /mmdet/ops/roi_pool/gradcheck.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import gradcheck 3 | 4 | import os.path as osp 5 | import sys 6 | sys.path.append(osp.abspath(osp.join(__file__, '../../'))) 7 | from roi_pool import RoIPool # noqa: E402 8 | 9 | feat = torch.randn(4, 16, 15, 15, requires_grad=True).cuda() 10 | rois = torch.Tensor([[0, 0, 0, 50, 50], [0, 10, 30, 43, 55], 11 | [1, 67, 40, 110, 120]]).cuda() 12 | inputs = (feat, rois) 13 | print('Gradcheck for roi pooling...') 14 | test = gradcheck(RoIPool(4, 1.0 / 8), inputs, eps=1e-5, atol=1e-3) 15 | print(test) 16 | -------------------------------------------------------------------------------- /mmdet/core/bbox/samplers/combined_sampler.py: -------------------------------------------------------------------------------- 1 | from .base_sampler import BaseSampler 2 | from ..assign_sampling import build_sampler 3 | 4 | 5 | class CombinedSampler(BaseSampler): 6 | 7 | def __init__(self, pos_sampler, neg_sampler, **kwargs): 8 | super(CombinedSampler, self).__init__(**kwargs) 9 | self.pos_sampler = build_sampler(pos_sampler, **kwargs) 10 | self.neg_sampler = build_sampler(neg_sampler, **kwargs) 11 | 12 | def _sample_pos(self, **kwargs): 13 | raise NotImplementedError 14 | 15 | def _sample_neg(self, **kwargs): 16 | raise NotImplementedError 17 | -------------------------------------------------------------------------------- /mmdet/ops/roi_align/modules/roi_align.py: -------------------------------------------------------------------------------- 1 | from torch.nn.modules.module import Module 2 | from ..functions.roi_align import RoIAlignFunction 3 | 4 | 5 | class RoIAlign(Module): 6 | 7 | def __init__(self, out_size, spatial_scale, sample_num=0): 8 | super(RoIAlign, self).__init__() 9 | 10 | self.out_size = out_size 11 | self.spatial_scale = float(spatial_scale) 12 | self.sample_num = int(sample_num) 13 | 14 | def forward(self, features, rois): 15 | return RoIAlignFunction.apply(features, rois, self.out_size, 16 | self.spatial_scale, self.sample_num) 17 | -------------------------------------------------------------------------------- /mmdet/core/bbox/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_sampler import BaseSampler 2 | from .pseudo_sampler import PseudoSampler 3 | from .random_sampler import RandomSampler 4 | from .instance_balanced_pos_sampler import InstanceBalancedPosSampler 5 | from .iou_balanced_neg_sampler import IoUBalancedNegSampler 6 | from .combined_sampler import CombinedSampler 7 | from .ohem_sampler import OHEMSampler 8 | from .sampling_result import SamplingResult 9 | 10 | __all__ = [ 11 | 'BaseSampler', 'PseudoSampler', 'RandomSampler', 12 | 'InstanceBalancedPosSampler', 'IoUBalancedNegSampler', 'CombinedSampler', 13 | 'OHEMSampler', 'SamplingResult' 14 | ] 15 | -------------------------------------------------------------------------------- /mmdet/datasets/voc.py: -------------------------------------------------------------------------------- 1 | from .xml_style import XMLDataset 2 | 3 | 4 | class VOCDataset(XMLDataset): 5 | 6 | CLASSES = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 7 | 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 8 | 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 9 | 'tvmonitor') 10 | 11 | def __init__(self, **kwargs): 12 | super(VOCDataset, self).__init__(**kwargs) 13 | if 'VOC2007' in self.img_prefix: 14 | self.year = 2007 15 | 
15 | elif 'VOC2012' in self.img_prefix: 16 | self.year = 2012 17 | else: 18 | raise ValueError('Cannot infer dataset year from img_prefix') 19 | -------------------------------------------------------------------------------- /mmdet/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbones import * # noqa: F401,F403 2 | from .necks import * # noqa: F401,F403 3 | from .roi_extractors import * # noqa: F401,F403 4 | from .anchor_heads import * # noqa: F401,F403 5 | from .bbox_heads import * # noqa: F401,F403 6 | from .mask_heads import * # noqa: F401,F403 7 | from .track_heads import * # noqa: F401,F403 8 | from .detectors import * # noqa: F401,F403 9 | from .registry import BACKBONES, NECKS, ROI_EXTRACTORS, HEADS, DETECTORS 10 | from .builder import (build_backbone, build_neck, build_roi_extractor, 11 | build_head, build_detector) 12 | 13 | __all__ = [ 14 | 'BACKBONES', 'NECKS', 'ROI_EXTRACTORS', 'HEADS', 'DETECTORS', 15 | 'build_backbone', 'build_neck', 'build_roi_extractor', 'build_head', 16 | 'build_detector' 17 | ] 18 | -------------------------------------------------------------------------------- /mmdet/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .custom import CustomDataset 2 | from .xml_style import XMLDataset 3 | from .coco import CocoDataset 4 | from .ytvos import YTVOSDataset 5 | from .voc import VOCDataset 6 | from .loader import GroupSampler, DistributedGroupSampler, build_dataloader 7 | from .utils import to_tensor, random_scale, show_ann, get_dataset 8 | from .concat_dataset import ConcatDataset 9 | from .repeat_dataset import RepeatDataset 10 | from .extra_aug import ExtraAugmentation 11 | 12 | __all__ = [ 13 | 'CustomDataset', 'XMLDataset', 'CocoDataset', 'YTVOSDataset', 14 | 'VOCDataset', 'GroupSampler', 15 | 'DistributedGroupSampler', 'build_dataloader', 'to_tensor', 'random_scale', 16 | 'show_ann', 'get_dataset', 'ConcatDataset', 'RepeatDataset', 17 | 'ExtraAugmentation' 18 | ] 19 | -------------------------------------------------------------------------------- /mmdet/datasets/concat_dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch.utils.data.dataset import ConcatDataset as _ConcatDataset 3 | 4 | 5 | class ConcatDataset(_ConcatDataset): 6 | """A wrapper of concatenated dataset. 7 | 8 | Same as :obj:`torch.utils.data.dataset.ConcatDataset`, but 9 | concatenates the group flag for image aspect ratio. 10 | 11 | Args: 12 | datasets (list[:obj:`Dataset`]): A list of datasets.
13 | """ 14 | 15 | def __init__(self, datasets): 16 | super(ConcatDataset, self).__init__(datasets) 17 | self.CLASSES = datasets[0].CLASSES 18 | if hasattr(datasets[0], 'flag'): 19 | flags = [] 20 | for i in range(0, len(datasets)): 21 | flags.append(datasets[i].flag) 22 | self.flag = np.concatenate(flags) 23 | -------------------------------------------------------------------------------- /mmdet/models/detectors/faster_rcnn.py: -------------------------------------------------------------------------------- 1 | from .two_stage import TwoStageDetector 2 | from ..registry import DETECTORS 3 | 4 | 5 | @DETECTORS.register_module 6 | class FasterRCNN(TwoStageDetector): 7 | 8 | def __init__(self, 9 | backbone, 10 | neck, 11 | rpn_head, 12 | bbox_roi_extractor, 13 | bbox_head, 14 | train_cfg, 15 | test_cfg, 16 | pretrained=None): 17 | super(FasterRCNN, self).__init__( 18 | backbone=backbone, 19 | neck=neck, 20 | rpn_head=rpn_head, 21 | bbox_roi_extractor=bbox_roi_extractor, 22 | bbox_head=bbox_head, 23 | train_cfg=train_cfg, 24 | test_cfg=test_cfg, 25 | pretrained=pretrained) 26 | -------------------------------------------------------------------------------- /mmdet/core/bbox/assigners/assign_result.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class AssignResult(object): 5 | 6 | def __init__(self, num_gts, gt_inds, max_overlaps, labels=None, pids=None): 7 | self.num_gts = num_gts 8 | self.gt_inds = gt_inds 9 | self.max_overlaps = max_overlaps 10 | self.labels = labels 11 | self.pids = pids 12 | 13 | def add_gt_(self, gt_labels, gt_pids): 14 | self_inds = torch.arange( 15 | 1, len(gt_labels) + 1, dtype=torch.long, device=gt_labels.device) 16 | self.gt_inds = torch.cat([self_inds, self.gt_inds]) 17 | self.max_overlaps = torch.cat( 18 | [self.max_overlaps.new_ones(self.num_gts), self.max_overlaps]) 19 | if self.labels is not None: 20 | self.labels = torch.cat([gt_labels, self.labels]) 21 | if self.pids is not None: 22 | self.pids = torch.cat([gt_pids, self.pids]) 23 | -------------------------------------------------------------------------------- /tools/coco_eval.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | from mmdet.core import coco_eval 4 | 5 | 6 | def main(): 7 | parser = ArgumentParser(description='COCO Evaluation') 8 | parser.add_argument('result', help='result file path') 9 | parser.add_argument('--ann', help='annotation file path') 10 | parser.add_argument( 11 | '--types', 12 | type=str, 13 | nargs='+', 14 | choices=['proposal_fast', 'proposal', 'bbox', 'segm', 'keypoint'], 15 | default=['bbox'], 16 | help='result types') 17 | parser.add_argument( 18 | '--max-dets', 19 | type=int, 20 | nargs='+', 21 | default=[100, 300, 1000], 22 | help='proposal numbers, only used for recall evaluation') 23 | args = parser.parse_args() 24 | coco_eval(args.result, args.types, args.ann, args.max_dets) 25 | 26 | 27 | if __name__ == '__main__': 28 | main() 29 | -------------------------------------------------------------------------------- /mmdet/core/bbox/samplers/pseudo_sampler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .base_sampler import BaseSampler 4 | from .sampling_result import SamplingResult 5 | 6 | 7 | class PseudoSampler(BaseSampler): 8 | 9 | def __init__(self, **kwargs): 10 | pass 11 | 12 | def _sample_pos(self, **kwargs): 13 | raise NotImplementedError 14 | 15 | def 
_sample_neg(self, **kwargs): 16 | raise NotImplementedError 17 | 18 | def sample(self, assign_result, bboxes, gt_bboxes, **kwargs): 19 | pos_inds = torch.nonzero( 20 | assign_result.gt_inds > 0).squeeze(-1).unique() 21 | neg_inds = torch.nonzero( 22 | assign_result.gt_inds == 0).squeeze(-1).unique() 23 | gt_flags = bboxes.new_zeros(bboxes.shape[0], dtype=torch.uint8) 24 | sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes, 25 | assign_result, gt_flags) 26 | return sampling_result 27 | -------------------------------------------------------------------------------- /mmdet/ops/roi_align/gradcheck.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.autograd import gradcheck 4 | 5 | import os.path as osp 6 | import sys 7 | sys.path.append(osp.abspath(osp.join(__file__, '../../'))) 8 | from roi_align import RoIAlign # noqa: E402 9 | 10 | feat_size = 15 11 | spatial_scale = 1.0 / 8 12 | img_size = feat_size / spatial_scale 13 | num_imgs = 2 14 | num_rois = 20 15 | 16 | batch_ind = np.random.randint(num_imgs, size=(num_rois, 1)) 17 | rois = np.random.rand(num_rois, 4) * img_size * 0.5 18 | rois[:, 2:] += img_size * 0.5 19 | rois = np.hstack((batch_ind, rois)) 20 | 21 | feat = torch.randn( 22 | num_imgs, 16, feat_size, feat_size, requires_grad=True, device='cuda:0') 23 | rois = torch.from_numpy(rois).float().cuda() 24 | inputs = (feat, rois) 25 | print('Gradcheck for roi align...') 26 | test = gradcheck(RoIAlign(3, spatial_scale), inputs, atol=1e-3, eps=1e-3) 27 | print(test) 28 | test = gradcheck(RoIAlign(3, spatial_scale, 2), inputs, atol=1e-3, eps=1e-3) 29 | print(test) 30 | -------------------------------------------------------------------------------- /mmdet/models/detectors/mask_rcnn.py: -------------------------------------------------------------------------------- 1 | from .two_stage import TwoStageDetector 2 | from ..registry import DETECTORS 3 | 4 | 5 | @DETECTORS.register_module 6 | class MaskRCNN(TwoStageDetector): 7 | 8 | def __init__(self, 9 | backbone, 10 | neck, 11 | rpn_head, 12 | bbox_roi_extractor, 13 | bbox_head, 14 | track_head, 15 | mask_roi_extractor, 16 | mask_head, 17 | train_cfg, 18 | test_cfg, 19 | pretrained=None): 20 | super(MaskRCNN, self).__init__( 21 | backbone=backbone, 22 | neck=neck, 23 | rpn_head=rpn_head, 24 | bbox_roi_extractor=bbox_roi_extractor, 25 | bbox_head=bbox_head, 26 | track_head=track_head, 27 | mask_roi_extractor=mask_roi_extractor, 28 | mask_head=mask_head, 29 | train_cfg=train_cfg, 30 | test_cfg=test_cfg, 31 | pretrained=pretrained) 32 | -------------------------------------------------------------------------------- /mmdet/core/bbox/samplers/sampling_result.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class SamplingResult(object): 5 | 6 | def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result, 7 | gt_flags): 8 | self.pos_inds = pos_inds 9 | self.neg_inds = neg_inds 10 | self.pos_bboxes = bboxes[pos_inds] 11 | self.neg_bboxes = bboxes[neg_inds] 12 | self.pos_is_gt = gt_flags[pos_inds] 13 | 14 | self.num_gts = gt_bboxes.shape[0] 15 | self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 16 | self.pos_gt_bboxes = gt_bboxes[self.pos_assigned_gt_inds, :] 17 | if assign_result.labels is not None: 18 | self.pos_gt_labels = assign_result.labels[pos_inds] 19 | else: 20 | self.pos_gt_labels = None 21 | 22 | if assign_result.pids is not None: 23 | 
23 | self.pos_gt_pids = assign_result.pids[pos_inds] 24 | else: 25 | self.pos_gt_pids = None 26 | 27 | @property 28 | def bboxes(self): 29 | return torch.cat([self.pos_bboxes, self.neg_bboxes]) 30 | -------------------------------------------------------------------------------- /mmdet/core/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .class_names import (voc_classes, imagenet_det_classes, 2 | imagenet_vid_classes, coco_classes, dataset_aliases, 3 | get_classes) 4 | from .coco_utils import coco_eval, fast_eval_recall, results2json, results2json_videoseg, ytvos_eval 5 | from .eval_hooks import (DistEvalHook, DistEvalmAPHook, CocoDistEvalRecallHook, 6 | CocoDistEvalmAPHook) 7 | from .mean_ap import average_precision, eval_map, print_map_summary 8 | from .recall import (eval_recalls, print_recall_summary, plot_num_recall, 9 | plot_iou_recall) 10 | 11 | __all__ = [ 12 | 'voc_classes', 'imagenet_det_classes', 'imagenet_vid_classes', 13 | 'coco_classes', 'dataset_aliases', 'get_classes', 'coco_eval', 14 | 'fast_eval_recall', 'results2json', 'DistEvalHook', 'DistEvalmAPHook', 15 | 'CocoDistEvalRecallHook', 'CocoDistEvalmAPHook', 'average_precision', 16 | 'eval_map', 'print_map_summary', 'eval_recalls', 'print_recall_summary', 17 | 'plot_num_recall', 'plot_iou_recall', 18 | 'results2json_videoseg', 'ytvos_eval' 19 | ] 20 | -------------------------------------------------------------------------------- /mmdet/core/bbox/__init__.py: -------------------------------------------------------------------------------- 1 | from .geometry import bbox_overlaps 2 | from .assigners import BaseAssigner, MaxIoUAssigner, AssignResult 3 | from .samplers import (BaseSampler, PseudoSampler, RandomSampler, 4 | InstanceBalancedPosSampler, IoUBalancedNegSampler, 5 | CombinedSampler, SamplingResult) 6 | from .assign_sampling import build_assigner, build_sampler, assign_and_sample 7 | from .transforms import (bbox2delta, delta2bbox, bbox_flip, bbox_mapping, 8 | bbox_mapping_back, bbox2roi, roi2bbox, bbox2result, 9 | bbox2result_with_id) 10 | from .bbox_target import bbox_target 11 | 12 | __all__ = [ 13 | 'bbox_overlaps', 'BaseAssigner', 'MaxIoUAssigner', 'AssignResult', 14 | 'BaseSampler', 'PseudoSampler', 'RandomSampler', 15 | 'InstanceBalancedPosSampler', 'IoUBalancedNegSampler', 'CombinedSampler', 16 | 'SamplingResult', 'build_assigner', 'build_sampler', 'assign_and_sample', 17 | 'bbox2delta', 'delta2bbox', 'bbox_flip', 'bbox_mapping', 18 | 'bbox_mapping_back', 'bbox2roi', 'roi2bbox', 'bbox2result', 'bbox_target', 19 | 'bbox2result_with_id' 20 | ] 21 | -------------------------------------------------------------------------------- /mmdet/core/utils/misc.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import mmcv 4 | import numpy as np 5 | from six.moves import map, zip 6 | 7 | 8 | def tensor2imgs(tensor, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=True): 9 | num_imgs = tensor.size(0) 10 | mean = np.array(mean, dtype=np.float32) 11 | std = np.array(std, dtype=np.float32) 12 | imgs = [] 13 | for img_id in range(num_imgs): 14 | img = tensor[img_id, ...].cpu().numpy().transpose(1, 2, 0) 15 | img = mmcv.imdenormalize( 16 | img, mean, std, to_bgr=to_rgb).astype(np.uint8) 17 | imgs.append(np.ascontiguousarray(img)) 18 | return imgs 19 | 20 | 21 | def multi_apply(func, *args, **kwargs): 22 | pfunc = partial(func, **kwargs) if kwargs else func 23 | map_results = map(pfunc, *args) 24 |
return tuple(map(list, zip(*map_results))) 25 | 26 | 27 | def unmap(data, count, inds, fill=0): 28 | """ Unmap a subset of item (data) back to the original set of items (of 29 | size count) """ 30 | if data.dim() == 1: 31 | ret = data.new_full((count, ), fill) 32 | ret[inds] = data 33 | else: 34 | new_size = (count, ) + data.size()[1:] 35 | ret = data.new_full(new_size, fill) 36 | ret[inds, :] = data 37 | return ret 38 | -------------------------------------------------------------------------------- /mmdet/core/mask/utils.py: -------------------------------------------------------------------------------- 1 | import mmcv 2 | 3 | 4 | def split_combined_polys(polys, poly_lens, polys_per_mask): 5 | """Split the combined 1-D polys into masks. 6 | 7 | A mask is represented as a list of polys, and a poly is represented as 8 | a 1-D array. In dataset, all masks are concatenated into a single 1-D 9 | tensor. Here we need to split the tensor into original representations. 10 | 11 | Args: 12 | polys (list): a list (length = image num) of 1-D tensors 13 | poly_lens (list): a list (length = image num) of poly length 14 | polys_per_mask (list): a list (length = image num) of poly number 15 | of each mask 16 | 17 | Returns: 18 | list: a list (length = image num) of list (length = mask num) of 19 | list (length = poly num) of numpy array 20 | """ 21 | mask_polys_list = [] 22 | for img_id in range(len(polys)): 23 | polys_single = polys[img_id] 24 | polys_lens_single = poly_lens[img_id].tolist() 25 | polys_per_mask_single = polys_per_mask[img_id].tolist() 26 | 27 | split_polys = mmcv.slice_list(polys_single, polys_lens_single) 28 | mask_polys = mmcv.slice_list(split_polys, polys_per_mask_single) 29 | mask_polys_list.append(mask_polys) 30 | return mask_polys_list 31 | -------------------------------------------------------------------------------- /mmdet/models/registry.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | class Registry(object): 5 | 6 | def __init__(self, name): 7 | self._name = name 8 | self._module_dict = dict() 9 | 10 | @property 11 | def name(self): 12 | return self._name 13 | 14 | @property 15 | def module_dict(self): 16 | return self._module_dict 17 | 18 | def _register_module(self, module_class): 19 | """Register a module. 20 | 21 | Args: 22 | module (:obj:`nn.Module`): Module to be registered. 23 | """ 24 | if not issubclass(module_class, nn.Module): 25 | raise TypeError( 26 | 'module must be a child of nn.Module, but got {}'.format( 27 | type(module_class))) 28 | module_name = module_class.__name__ 29 | if module_name in self._module_dict: 30 | raise KeyError('{} is already registered in {}'.format( 31 | module_name, self.name)) 32 | self._module_dict[module_name] = module_class 33 | 34 | def register_module(self, cls): 35 | self._register_module(cls) 36 | return cls 37 | 38 | 39 | BACKBONES = Registry('backbone') 40 | NECKS = Registry('neck') 41 | ROI_EXTRACTORS = Registry('roi_extractor') 42 | HEADS = Registry('head') 43 | DETECTORS = Registry('detector') 44 | -------------------------------------------------------------------------------- /mmdet/core/bbox/assign_sampling.py: -------------------------------------------------------------------------------- 1 | import mmcv 2 | 3 | from . 
import assigners, samplers 4 | 5 | 6 | def build_assigner(cfg, **kwargs): 7 | if isinstance(cfg, assigners.BaseAssigner): 8 | return cfg 9 | elif isinstance(cfg, dict): 10 | return mmcv.runner.obj_from_dict( 11 | cfg, assigners, default_args=kwargs) 12 | else: 13 | raise TypeError('Invalid type {} for building an assigner'.format( 14 | type(cfg))) 15 | 16 | 17 | def build_sampler(cfg, **kwargs): 18 | if isinstance(cfg, samplers.BaseSampler): 19 | return cfg 20 | elif isinstance(cfg, dict): 21 | return mmcv.runner.obj_from_dict( 22 | cfg, samplers, default_args=kwargs) 23 | else: 24 | raise TypeError('Invalid type {} for building a sampler'.format( 25 | type(cfg))) 26 | 27 | 28 | def assign_and_sample(bboxes, gt_bboxes, gt_bboxes_ignore, gt_labels, cfg): 29 | bbox_assigner = build_assigner(cfg.assigner) 30 | bbox_sampler = build_sampler(cfg.sampler) 31 | assign_result = bbox_assigner.assign(bboxes, gt_bboxes, gt_bboxes_ignore, 32 | gt_labels) 33 | sampling_result = bbox_sampler.sample(assign_result, bboxes, gt_bboxes, 34 | gt_labels) 35 | return assign_result, sampling_result 36 | -------------------------------------------------------------------------------- /mmdet/datasets/loader/build_loader.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | from mmcv.runner import get_dist_info 4 | from mmcv.parallel import collate 5 | from torch.utils.data import DataLoader 6 | 7 | from .sampler import GroupSampler, DistributedGroupSampler 8 | 9 | # https://github.com/pytorch/pytorch/issues/973 10 | import resource 11 | rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) 12 | resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1])) 13 | 14 | 15 | def build_dataloader(dataset, 16 | imgs_per_gpu, 17 | workers_per_gpu, 18 | num_gpus=1, 19 | dist=True, 20 | **kwargs): 21 | if dist: 22 | rank, world_size = get_dist_info() 23 | sampler = DistributedGroupSampler(dataset, imgs_per_gpu, world_size, 24 | rank) 25 | batch_size = imgs_per_gpu 26 | num_workers = workers_per_gpu 27 | else: 28 | if not kwargs.get('shuffle', True): 29 | sampler = None 30 | else: 31 | sampler = GroupSampler(dataset, imgs_per_gpu) 32 | batch_size = num_gpus * imgs_per_gpu 33 | num_workers = num_gpus * workers_per_gpu 34 | 35 | data_loader = DataLoader( 36 | dataset, 37 | batch_size=batch_size, 38 | sampler=sampler, 39 | num_workers=num_workers, 40 | collate_fn=partial(collate, samples_per_gpu=imgs_per_gpu), 41 | pin_memory=False, 42 | **kwargs) 43 | 44 | return data_loader 45 | -------------------------------------------------------------------------------- /mmdet/core/mask/mask_target.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import mmcv 4 | 5 | 6 | def mask_target(pos_proposals_list, pos_assigned_gt_inds_list, gt_masks_list, 7 | cfg): 8 | cfg_list = [cfg for _ in range(len(pos_proposals_list))] 9 | mask_targets = map(mask_target_single, pos_proposals_list, 10 | pos_assigned_gt_inds_list, gt_masks_list, cfg_list) 11 | mask_targets = torch.cat(list(mask_targets)) 12 | return mask_targets 13 | 14 | 15 | def mask_target_single(pos_proposals, pos_assigned_gt_inds, gt_masks, cfg): 16 | mask_size = cfg.mask_size 17 | num_pos = pos_proposals.size(0) 18 | mask_targets = [] 19 | if num_pos > 0: 20 | proposals_np = pos_proposals.cpu().numpy() 21 | pos_assigned_gt_inds = pos_assigned_gt_inds.cpu().numpy() 22 | for i in range(num_pos): 23 | gt_mask = gt_masks[pos_assigned_gt_inds[i]]
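# crop the matched gt mask with the proposal box (cast to int, width and
# height clamped to at least 1 px), then resize the crop to
# (mask_size, mask_size) to form the training target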
24 | bbox = proposals_np[i, :].astype(np.int32) 25 | x1, y1, x2, y2 = bbox 26 | w = np.maximum(x2 - x1 + 1, 1) 27 | h = np.maximum(y2 - y1 + 1, 1) 28 | # mask is uint8 both before and after resizing 29 | target = mmcv.imresize(gt_mask[y1:y1 + h, x1:x1 + w], 30 | (mask_size, mask_size)) 31 | mask_targets.append(target) 32 | mask_targets = torch.from_numpy(np.stack(mask_targets)).float().to( 33 | pos_proposals.device) 34 | else: 35 | mask_targets = pos_proposals.new_zeros((0, mask_size, mask_size)) 36 | return mask_targets 37 | -------------------------------------------------------------------------------- /mmdet/ops/nms/gpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | assert sizeof(int) == sizeof(np.int32_t) 12 | 13 | cdef extern from "gpu_nms.hpp": 14 | void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int, size_t) nogil 15 | size_t nms_Malloc() nogil 16 | 17 | memory_pool = {} 18 | 19 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, 20 | np.int32_t device_id=0): 21 | cdef int boxes_num = dets.shape[0] 22 | cdef int boxes_dim = 5 23 | cdef int num_out 24 | cdef size_t base 25 | cdef np.ndarray[np.int32_t, ndim=1] \ 26 | keep = np.zeros(boxes_num, dtype=np.int32) 27 | cdef np.ndarray[np.float32_t, ndim=1] \ 28 | scores = dets[:, 4] 29 | cdef np.ndarray[np.int_t, ndim=1] \ 30 | order = scores.argsort()[::-1] 31 | cdef np.ndarray[np.float32_t, ndim=2] \ 32 | sorted_dets = dets[order, :5] 33 | cdef float cthresh = thresh 34 | if device_id not in memory_pool: 35 | with nogil: 36 | base = nms_Malloc() 37 | memory_pool[device_id] = base 38 | # print "malloc", base 39 | base = memory_pool[device_id] 40 | with nogil: 41 | _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, cthresh, device_id, base) 42 | keep = keep[:num_out] 43 | return list(order[keep]) 44 | -------------------------------------------------------------------------------- /mmdet/models/utils/weight_init.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch.nn as nn 3 | 4 | 5 | def xavier_init(module, gain=1, bias=0, distribution='normal'): 6 | assert distribution in ['uniform', 'normal'] 7 | if distribution == 'uniform': 8 | nn.init.xavier_uniform_(module.weight, gain=gain) 9 | else: 10 | nn.init.xavier_normal_(module.weight, gain=gain) 11 | if hasattr(module, 'bias'): 12 | nn.init.constant_(module.bias, bias) 13 | 14 | 15 | def normal_init(module, mean=0, std=1, bias=0): 16 | nn.init.normal_(module.weight, mean, std) 17 | if hasattr(module, 'bias'): 18 | nn.init.constant_(module.bias, bias) 19 | 20 | 21 | def uniform_init(module, a=0, b=1, bias=0): 22 | nn.init.uniform_(module.weight, a, b) 23 | if hasattr(module, 'bias'): 24 | nn.init.constant_(module.bias, bias) 25 | 26 | 27 | def kaiming_init(module, 28 | mode='fan_out', 29 | nonlinearity='relu', 30 | bias=0, 31 | distribution='normal'): 32 | assert distribution in ['uniform', 'normal'] 33 | if distribution == 'uniform': 34 | nn.init.kaiming_uniform_( 35 | module.weight, mode=mode, nonlinearity=nonlinearity) 36 | else: 37 | nn.init.kaiming_normal_( 38 | module.weight, mode=mode, 
nonlinearity=nonlinearity) 39 | if hasattr(module, 'bias'): 40 | nn.init.constant_(module.bias, bias) 41 | 42 | 43 | def bias_init_with_prob(prior_prob): 44 | """Initialize conv/fc bias value according to a given probability.""" 45 | bias_init = float(-np.log((1 - prior_prob) / prior_prob)) 46 | return bias_init 47 | -------------------------------------------------------------------------------- /mmdet/models/builder.py: -------------------------------------------------------------------------------- 1 | import mmcv 2 | from torch import nn 3 | 4 | from .registry import BACKBONES, NECKS, ROI_EXTRACTORS, HEADS, DETECTORS 5 | 6 | 7 | def _build_module(cfg, registry, default_args): 8 | assert isinstance(cfg, dict) and 'type' in cfg 9 | assert isinstance(default_args, dict) or default_args is None 10 | args = cfg.copy() 11 | obj_type = args.pop('type') 12 | if mmcv.is_str(obj_type): 13 | if obj_type not in registry.module_dict: 14 | raise KeyError('{} is not in the {} registry'.format( 15 | obj_type, registry.name)) 16 | obj_type = registry.module_dict[obj_type] 17 | elif not isinstance(obj_type, type): 18 | raise TypeError('type must be a str or valid type, but got {}'.format( 19 | type(obj_type))) 20 | if default_args is not None: 21 | for name, value in default_args.items(): 22 | args.setdefault(name, value) 23 | return obj_type(**args) 24 | 25 | 26 | def build(cfg, registry, default_args=None): 27 | if isinstance(cfg, list): 28 | modules = [_build_module(cfg_, registry, default_args) for cfg_ in cfg] 29 | return nn.Sequential(*modules) 30 | else: 31 | return _build_module(cfg, registry, default_args) 32 | 33 | 34 | def build_backbone(cfg): 35 | return build(cfg, BACKBONES) 36 | 37 | 38 | def build_neck(cfg): 39 | return build(cfg, NECKS) 40 | 41 | 42 | def build_roi_extractor(cfg): 43 | return build(cfg, ROI_EXTRACTORS) 44 | 45 | 46 | def build_head(cfg): 47 | return build(cfg, HEADS) 48 | 49 | 50 | def build_detector(cfg, train_cfg=None, test_cfg=None): 51 | return build(cfg, DETECTORS, dict(train_cfg=train_cfg, test_cfg=test_cfg)) 52 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | ### Requirements 4 | 5 | - Linux (tested on Ubuntu 16.04 and CentOS 7.2) 6 | - Python 3.4+ 7 | - PyTorch 0.4.1 8 | - Cython 9 | - [mmcv](https://github.com/open-mmlab/mmcv) 10 | 11 | ### Install mmdetection 12 | 13 | a. Install PyTorch 0.4.1 and torchvision following the [official instructions](https://pytorch.org/). 14 | 15 | b. Clone the mmdetection repository. 16 | 17 | ```shell 18 | git clone https://github.com/open-mmlab/mmdetection.git 19 | ``` 20 | 21 | c. Compile CUDA extensions. 22 | 23 | ```shell 24 | cd mmdetection 25 | pip install cython # or "conda install cython" if you prefer conda 26 | ./compile.sh # or "PYTHON=python3 ./compile.sh" if you use system python3 without virtual environments 27 | ``` 28 | 29 | d. Install mmdetection (other dependencies will be installed automatically). 30 | 31 | ```shell 32 | python(3) setup.py install # add --user if you want to install it locally 33 | # or "pip install ." 34 | ``` 35 | 36 | Note: You need to run the last step each time you pull updates from GitHub. 37 | The git commit id will be written to the version number and also saved in trained models. 38 | 39 | ### Prepare datasets 40 | 41 | It is recommended to symlink the dataset root to `$MMDETECTION/data`.
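A minimal sketch of that setup; `$COCO_ROOT` and `$VOCDEVKIT_ROOT` below are placeholders for wherever the datasets actually live on disk:

```shell
cd mmdetection
mkdir -p data
# link the real dataset locations into the layout expected by the configs
ln -s $COCO_ROOT data/coco
ln -s $VOCDEVKIT_ROOT data/VOCdevkit
```

With the links in place, the directory tree should match the layout below.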
42 | 43 | ``` 44 | mmdetection 45 | ├── mmdet 46 | ├── tools 47 | ├── configs 48 | ├── data 49 | │ ├── coco 50 | │ │ ├── annotations 51 | │ │ ├── train2017 52 | │ │ ├── val2017 53 | │ │ ├── test2017 54 | │ ├── VOCdevkit 55 | │ │ ├── VOC2007 56 | │ │ ├── VOC2012 57 | 58 | ``` 59 | 60 | ### Scripts 61 | Just for reference, [Here](https://gist.github.com/hellock/bf23cd7348c727d69d48682cb6909047) is 62 | a script for setting up mmdetection with conda. 63 | -------------------------------------------------------------------------------- /mmdet/apis/env.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import random 4 | 5 | import numpy as np 6 | import torch 7 | import torch.distributed as dist 8 | import torch.multiprocessing as mp 9 | from mmcv.runner import get_dist_info 10 | 11 | 12 | def init_dist(launcher, backend='nccl', **kwargs): 13 | if mp.get_start_method(allow_none=True) is None: 14 | mp.set_start_method('spawn') 15 | if launcher == 'pytorch': 16 | _init_dist_pytorch(backend, **kwargs) 17 | elif launcher == 'mpi': 18 | _init_dist_mpi(backend, **kwargs) 19 | elif launcher == 'slurm': 20 | _init_dist_slurm(backend, **kwargs) 21 | else: 22 | raise ValueError('Invalid launcher type: {}'.format(launcher)) 23 | 24 | 25 | def _init_dist_pytorch(backend, **kwargs): 26 | # TODO: use local_rank instead of rank % num_gpus 27 | rank = int(os.environ['RANK']) 28 | num_gpus = torch.cuda.device_count() 29 | torch.cuda.set_device(rank % num_gpus) 30 | dist.init_process_group(backend=backend, **kwargs) 31 | 32 | 33 | def _init_dist_mpi(backend, **kwargs): 34 | raise NotImplementedError 35 | 36 | 37 | def _init_dist_slurm(backend, **kwargs): 38 | raise NotImplementedError 39 | 40 | 41 | def set_random_seed(seed): 42 | random.seed(seed) 43 | np.random.seed(seed) 44 | torch.manual_seed(seed) 45 | torch.cuda.manual_seed_all(seed) 46 | 47 | 48 | def get_root_logger(log_level=logging.INFO): 49 | logger = logging.getLogger() 50 | if not logger.hasHandlers(): 51 | logging.basicConfig( 52 | format='%(asctime)s - %(levelname)s - %(message)s', 53 | level=log_level) 54 | rank, _ = get_dist_info() 55 | if rank != 0: 56 | logger.setLevel('ERROR') 57 | return logger 58 | -------------------------------------------------------------------------------- /mmdet/core/evaluation/bbox_overlaps.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def bbox_overlaps(bboxes1, bboxes2, mode='iou'): 5 | """Calculate the ious between each bbox of bboxes1 and bboxes2. 
6 | 7 | Args: 8 | bboxes1(ndarray): shape (n, 4) 9 | bboxes2(ndarray): shape (k, 4) 10 | mode(str): iou (intersection over union) or iof (intersection 11 | over foreground) 12 | 13 | Returns: 14 | ious(ndarray): shape (n, k) 15 | """ 16 | 17 | assert mode in ['iou', 'iof'] 18 | 19 | bboxes1 = bboxes1.astype(np.float32) 20 | bboxes2 = bboxes2.astype(np.float32) 21 | rows = bboxes1.shape[0] 22 | cols = bboxes2.shape[0] 23 | ious = np.zeros((rows, cols), dtype=np.float32) 24 | if rows * cols == 0: 25 | return ious 26 | exchange = False 27 | if bboxes1.shape[0] > bboxes2.shape[0]: 28 | bboxes1, bboxes2 = bboxes2, bboxes1 29 | ious = np.zeros((cols, rows), dtype=np.float32) 30 | exchange = True 31 | area1 = (bboxes1[:, 2] - bboxes1[:, 0] + 1) * ( 32 | bboxes1[:, 3] - bboxes1[:, 1] + 1) 33 | area2 = (bboxes2[:, 2] - bboxes2[:, 0] + 1) * ( 34 | bboxes2[:, 3] - bboxes2[:, 1] + 1) 35 | for i in range(bboxes1.shape[0]): 36 | x_start = np.maximum(bboxes1[i, 0], bboxes2[:, 0]) 37 | y_start = np.maximum(bboxes1[i, 1], bboxes2[:, 1]) 38 | x_end = np.minimum(bboxes1[i, 2], bboxes2[:, 2]) 39 | y_end = np.minimum(bboxes1[i, 3], bboxes2[:, 3]) 40 | overlap = np.maximum(x_end - x_start + 1, 0) * np.maximum( 41 | y_end - y_start + 1, 0) 42 | if mode == 'iou': 43 | union = area1[i] + area2 - overlap 44 | else: 45 | union = area1[i] if not exchange else area2 46 | ious[i, :] = overlap / union 47 | if exchange: 48 | ious = ious.T 49 | return ious 50 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # cython generated cpp 107 | mmdet/ops/nms/*.cpp 108 | mmdet/version.py 109 | data 110 | .vscode 111 | .idea 112 | -------------------------------------------------------------------------------- /mmdet/models/detectors/fast_rcnn.py: -------------------------------------------------------------------------------- 1 | from .two_stage import TwoStageDetector 2 | from ..registry import DETECTORS 3 | 4 | 5 | @DETECTORS.register_module 6 | class FastRCNN(TwoStageDetector): 7 | 8 | def __init__(self, 9 | backbone, 10 | neck, 11 | bbox_roi_extractor, 12 | bbox_head, 13 | train_cfg, 14 | test_cfg, 15 | mask_roi_extractor=None, 16 | mask_head=None, 17 | pretrained=None): 18 | super(FastRCNN, self).__init__( 19 | backbone=backbone, 20 | neck=neck, 21 | bbox_roi_extractor=bbox_roi_extractor, 22 | bbox_head=bbox_head, 23 | train_cfg=train_cfg, 24 | test_cfg=test_cfg, 25 | mask_roi_extractor=mask_roi_extractor, 26 | mask_head=mask_head, 27 | pretrained=pretrained) 28 | 29 | def forward_test(self, imgs, img_metas, proposals, **kwargs): 30 | for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]: 31 | if not isinstance(var, list): 32 | raise TypeError('{} must be a list, but got {}'.format( 33 | name, type(var))) 34 | 35 | num_augs = len(imgs) 36 | if num_augs != len(img_metas): 37 | raise ValueError( 38 | 'num of augmentations ({}) != num of image meta ({})'.format( 39 | len(imgs), len(img_metas))) 40 | # TODO: remove the restriction of imgs_per_gpu == 1 when prepared 41 | imgs_per_gpu = imgs[0].size(0) 42 | assert imgs_per_gpu == 1 43 | 44 | if num_augs == 1: 45 | return self.simple_test(imgs[0], img_metas[0], proposals[0], 46 | **kwargs) 47 | else: 48 | return self.aug_test(imgs, img_metas, proposals, **kwargs) 49 | -------------------------------------------------------------------------------- /mmdet/models/utils/norm.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | norm_cfg = { 5 | # format: layer_type: (abbreviation, module) 6 | 'BN': ('bn', nn.BatchNorm2d), 7 | 'SyncBN': ('bn', None), 8 | 'GN': ('gn', nn.GroupNorm), 9 | # and potentially 'SN' 10 | } 11 | 12 | 13 | def build_norm_layer(cfg, num_features, postfix=''): 14 | """ Build normalization layer 15 | 16 | Args: 17 | cfg (dict): cfg should contain: 18 | type (str): identify norm layer type. 
19 |             layer args: args needed to instantiate a norm layer.
20 |             frozen (bool): [optional] whether to stop gradient updates
21 |                 of the norm layer; freezing the norm layers of a
22 |                 backbone is a common use case.
23 |         num_features (int): number of channels from input
24 |         postfix (int, str): appended to the norm abbreviation to
25 |             create a named layer.
26 | 
27 |     Returns:
28 |         name (str): abbreviation + postfix
29 |         layer (nn.Module): created norm layer
30 |     """
31 |     assert isinstance(cfg, dict) and 'type' in cfg
32 |     cfg_ = cfg.copy()
33 | 
34 |     layer_type = cfg_.pop('type')
35 |     if layer_type not in norm_cfg:
36 |         raise KeyError('Unrecognized norm type {}'.format(layer_type))
37 |     else:
38 |         abbr, norm_layer = norm_cfg[layer_type]
39 |         if norm_layer is None:
40 |             raise NotImplementedError
41 | 
42 |     assert isinstance(postfix, (int, str))
43 |     name = abbr + str(postfix)
44 | 
45 |     frozen = cfg_.pop('frozen', False)
46 |     cfg_.setdefault('eps', 1e-5)
47 |     if layer_type != 'GN':
48 |         layer = norm_layer(num_features, **cfg_)
49 |     else:
50 |         assert 'num_groups' in cfg_
51 |         layer = norm_layer(num_channels=num_features, **cfg_)
52 | 
53 |     if frozen:
54 |         for param in layer.parameters():
55 |             param.requires_grad = False
56 | 
57 |     return name, layer
58 | 
--------------------------------------------------------------------------------
/mmdet/core/bbox/samplers/instance_balanced_pos_sampler.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import torch
 3 | 
 4 | from .random_sampler import RandomSampler
 5 | 
 6 | 
 7 | class InstanceBalancedPosSampler(RandomSampler):
 8 | 
 9 |     def _sample_pos(self, assign_result, num_expected, **kwargs):
10 |         pos_inds = torch.nonzero(assign_result.gt_inds > 0)
11 |         if pos_inds.numel() != 0:
12 |             pos_inds = pos_inds.squeeze(1)
13 |         if pos_inds.numel() <= num_expected:
14 |             return pos_inds
15 |         else:
16 |             unique_gt_inds = assign_result.gt_inds[pos_inds].unique()
17 |             num_gts = len(unique_gt_inds)
18 |             num_per_gt = int(round(num_expected / float(num_gts)) + 1)
19 |             sampled_inds = []
20 |             for i in unique_gt_inds:
21 |                 inds = torch.nonzero(assign_result.gt_inds == i.item())
22 |                 if inds.numel() != 0:
23 |                     inds = inds.squeeze(1)
24 |                 else:
25 |                     continue
26 |                 if len(inds) > num_per_gt:
27 |                     inds = self.random_choice(inds, num_per_gt)
28 |                 sampled_inds.append(inds)
29 |             sampled_inds = torch.cat(sampled_inds)
30 |             if len(sampled_inds) < num_expected:
31 |                 num_extra = num_expected - len(sampled_inds)
32 |                 extra_inds = np.array(
33 |                     list(set(pos_inds.cpu()) - set(sampled_inds.cpu())))
34 |                 if len(extra_inds) > num_extra:
35 |                     extra_inds = self.random_choice(extra_inds, num_extra)
36 |                 extra_inds = torch.from_numpy(extra_inds).to(
37 |                     assign_result.gt_inds.device).long()
38 |                 sampled_inds = torch.cat([sampled_inds, extra_inds])
39 |             elif len(sampled_inds) > num_expected:
40 |                 sampled_inds = self.random_choice(sampled_inds, num_expected)
41 |             return sampled_inds
42 | 
--------------------------------------------------------------------------------
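The per-instance quota in _sample_pos above is easiest to see with concrete
numbers; a minimal sketch with toy values (not taken from any config):

    num_expected, num_gts = 6, 2
    num_per_gt = int(round(num_expected / float(num_gts)) + 1)  # 4 positives per GT
    # Any shortfall against num_expected is then topped up from the leftover
    # positive indices, and any excess is randomly subsampled back down.

/mmdet/ops/roi_pool/functions/roi_pool.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from torch.autograd import Function
 3 | 
 4 | from .. import roi_pool_cuda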
 5 | 
 6 | 
 7 | class RoIPoolFunction(Function):
 8 | 
 9 |     @staticmethod
10 |     def forward(ctx, features, rois, out_size, spatial_scale):
11 |         if isinstance(out_size, int):
12 |             out_h = out_size
13 |             out_w = out_size
14 |         elif isinstance(out_size, tuple):
15 |             assert len(out_size) == 2
16 |             assert isinstance(out_size[0], int)
17 |             assert isinstance(out_size[1], int)
18 |             out_h, out_w = out_size
19 |         else:
20 |             raise TypeError(
21 |                 '"out_size" must be an integer or tuple of integers')
22 |         assert features.is_cuda
23 |         ctx.save_for_backward(rois)
24 |         num_channels = features.size(1)
25 |         num_rois = rois.size(0)
26 |         out_size = (num_rois, num_channels, out_h, out_w)
27 |         output = features.new_zeros(*out_size)
28 | 
29 |         argmax = features.new_zeros(*out_size, dtype=torch.int)
30 |         roi_pool_cuda.forward(features, rois, out_h, out_w, spatial_scale,
31 |                               output, argmax)
32 |         ctx.spatial_scale = spatial_scale
33 |         ctx.feature_size = features.size()
34 |         ctx.argmax = argmax
35 | 
36 |         return output
37 | 
38 |     @staticmethod
39 |     def backward(ctx, grad_output):
40 |         assert grad_output.is_cuda
41 |         spatial_scale = ctx.spatial_scale
42 |         feature_size = ctx.feature_size
43 |         argmax = ctx.argmax
44 |         rois = ctx.saved_tensors[0]
45 |         assert feature_size is not None
46 | 
47 |         grad_input = grad_rois = None
48 |         if ctx.needs_input_grad[0]:
49 |             grad_input = grad_output.new(feature_size).zero_()
50 |             roi_pool_cuda.backward(grad_output, rois, argmax, spatial_scale,
51 |                                    grad_input)
52 | 
53 |         return grad_input, grad_rois, None, None
54 | 
55 | 
56 | roi_pool = RoIPoolFunction.apply
57 | 
--------------------------------------------------------------------------------
/tools/voc_eval.py:
--------------------------------------------------------------------------------
 1 | from argparse import ArgumentParser
 2 | 
 3 | import mmcv
 4 | import numpy as np
 5 | 
 6 | from mmdet import datasets
 7 | from mmdet.core import eval_map
 8 | 
 9 | 
10 | def voc_eval(result_file, dataset, iou_thr=0.5):
11 |     det_results = mmcv.load(result_file)
12 |     gt_bboxes = []
13 |     gt_labels = []
14 |     gt_ignore = []
15 |     for i in range(len(dataset)):
16 |         ann = dataset.get_ann_info(i)
17 |         bboxes = ann['bboxes']
18 |         labels = ann['labels']
19 |         if 'bboxes_ignore' in ann:
20 |             ignore = np.concatenate([
21 |                 np.zeros(bboxes.shape[0], dtype=np.bool),
22 |                 np.ones(ann['bboxes_ignore'].shape[0], dtype=np.bool)
23 |             ])
24 |             gt_ignore.append(ignore)
25 |             bboxes = np.vstack([bboxes, ann['bboxes_ignore']])
26 |             labels = np.concatenate([labels, ann['labels_ignore']])
27 |         gt_bboxes.append(bboxes)
28 |         gt_labels.append(labels)
29 |     if not gt_ignore:
30 |         gt_ignore = None  # no ignored boxes anywhere; disable ignore handling
31 |     if hasattr(dataset, 'year') and dataset.year == 2007:
32 |         dataset_name = 'voc07'
33 |     else:
34 |         dataset_name = dataset.CLASSES
35 |     eval_map(
36 |         det_results,
37 |         gt_bboxes,
38 |         gt_labels,
39 |         gt_ignore=gt_ignore,
40 |         scale_ranges=None,
41 |         iou_thr=iou_thr,
42 |         dataset=dataset_name,
43 |         print_summary=True)
44 | 
45 | 
46 | def main():
47 |     parser = ArgumentParser(description='VOC Evaluation')
48 |     parser.add_argument('result', help='result file path')
49 |     parser.add_argument('config', help='config file path')
50 |     parser.add_argument(
51 |         '--iou-thr',
52 |         type=float,
53 |         default=0.5,
54 |         help='IoU threshold for evaluation')
55 |     args = parser.parse_args()
56 |     cfg = mmcv.Config.fromfile(args.config)
57 |     test_dataset = mmcv.runner.obj_from_dict(cfg.data.test, datasets)
58 |     voc_eval(args.result, test_dataset, args.iou_thr)
59 | 
60 | 
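# A hedged usage example: evaluate a saved detection result file against the
# config's test split (the file names below are placeholders, not files that
# ship with this repo):
#
#   python tools/voc_eval.py results.pkl configs/pascal_voc/ssd300_voc.py --iou-thr 0.5
#
61 | if __name__ == '__main__':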
62 |     main()
63 | 
--------------------------------------------------------------------------------
/mmdet/core/bbox/samplers/random_sampler.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import torch
 3 | 
 4 | from .base_sampler import BaseSampler
 5 | 
 6 | 
 7 | class RandomSampler(BaseSampler):
 8 | 
 9 |     def __init__(self,
10 |                  num,
11 |                  pos_fraction,
12 |                  neg_pos_ub=-1,
13 |                  add_gt_as_proposals=True,
14 |                  **kwargs):
15 |         super(RandomSampler, self).__init__(num, pos_fraction, neg_pos_ub,
16 |                                             add_gt_as_proposals)
17 | 
18 |     @staticmethod
19 |     def random_choice(gallery, num):
20 |         """Randomly select some elements from the gallery.
21 | 
22 |         PyTorch's implementation seems to be slower than numpy's, so we use
23 |         numpy to randomly permute the indices.
24 |         """
25 |         assert len(gallery) >= num
26 |         if isinstance(gallery, list):
27 |             gallery = np.array(gallery)
28 |         cands = np.arange(len(gallery))
29 |         np.random.shuffle(cands)
30 |         rand_inds = cands[:num]
31 |         if not isinstance(gallery, np.ndarray):
32 |             rand_inds = torch.from_numpy(rand_inds).long().to(gallery.device)
33 |         return gallery[rand_inds]
34 | 
35 |     def _sample_pos(self, assign_result, num_expected, **kwargs):
36 |         """Randomly sample some positive samples."""
37 |         pos_inds = torch.nonzero(assign_result.gt_inds > 0)
38 |         if pos_inds.numel() != 0:
39 |             pos_inds = pos_inds.squeeze(1)
40 |         if pos_inds.numel() <= num_expected:
41 |             return pos_inds
42 |         else:
43 |             return self.random_choice(pos_inds, num_expected)
44 | 
45 |     def _sample_neg(self, assign_result, num_expected, **kwargs):
46 |         """Randomly sample some negative samples."""
47 |         neg_inds = torch.nonzero(assign_result.gt_inds == 0)
48 |         if neg_inds.numel() != 0:
49 |             neg_inds = neg_inds.squeeze(1)
50 |         if len(neg_inds) <= num_expected:
51 |             return neg_inds
52 |         else:
53 |             return self.random_choice(neg_inds, num_expected)
54 | 
--------------------------------------------------------------------------------
/mmdet/core/utils/dist_utils.py:
--------------------------------------------------------------------------------
 1 | from collections import OrderedDict
 2 | 
 3 | import torch.distributed as dist
 4 | from torch._utils import (_flatten_dense_tensors, _unflatten_dense_tensors,
 5 |                           _take_tensors)
 6 | from mmcv.runner import OptimizerHook
 7 | 
 8 | 
 9 | def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
10 |     if bucket_size_mb > 0:
11 |         bucket_size_bytes = bucket_size_mb * 1024 * 1024
12 |         buckets = _take_tensors(tensors, bucket_size_bytes)
13 |     else:
14 |         buckets = OrderedDict()
15 |         for tensor in tensors:
16 |             tp = tensor.type()
17 |             if tp not in buckets:
18 |                 buckets[tp] = []
19 |             buckets[tp].append(tensor)
20 |         buckets = buckets.values()
21 | 
22 |     for bucket in buckets:
23 |         flat_tensors = _flatten_dense_tensors(bucket)
24 |         dist.all_reduce(flat_tensors)
25 |         flat_tensors.div_(world_size)
26 |         for tensor, synced in zip(
27 |                 bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
28 |             tensor.copy_(synced)
29 | 
30 | 
31 | def allreduce_grads(model, coalesce=True, bucket_size_mb=-1):
32 |     grads = [
33 |         param.grad.data for param in model.parameters()
34 |         if param.requires_grad and param.grad is not None
35 |     ]
36 |     world_size = dist.get_world_size()
37 |     if coalesce:
38 |         _allreduce_coalesced(grads, world_size, bucket_size_mb)
39 |     else:
40 |         for tensor in grads:
41 |             dist.all_reduce(tensor.div_(world_size))
42 | 
43 | 
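# The coalesced all-reduce above averages gradients across ranks. A minimal
# sketch of the equivalent per-tensor operation, assuming an initialized
# torch.distributed process group:
#
#   dist.all_reduce(grad)               # sum the gradient over all ranks
#   grad.div_(dist.get_world_size())    # divide the sum to get the mean
#
44 | class DistOptimizerHook(OptimizerHook):
45 | 
46 |     def __init__(self, grad_clip=None,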
coalesce=True, bucket_size_mb=-1): 47 | self.grad_clip = grad_clip 48 | self.coalesce = coalesce 49 | self.bucket_size_mb = bucket_size_mb 50 | 51 | def after_train_iter(self, runner): 52 | runner.optimizer.zero_grad() 53 | runner.outputs['loss'].backward() 54 | allreduce_grads(runner.model, self.coalesce, self.bucket_size_mb) 55 | if self.grad_clip is not None: 56 | self.clip_grads(runner.model.parameters()) 57 | runner.optimizer.step() 58 | -------------------------------------------------------------------------------- /mmdet/ops/nms/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from .gpu_nms import gpu_nms 5 | from .cpu_nms import cpu_nms 6 | from .cpu_soft_nms import cpu_soft_nms 7 | 8 | 9 | def nms(dets, iou_thr, device_id=None): 10 | """Dispatch to either CPU or GPU NMS implementations.""" 11 | if isinstance(dets, torch.Tensor): 12 | is_tensor = True 13 | if dets.is_cuda: 14 | device_id = dets.get_device() 15 | dets_np = dets.detach().cpu().numpy() 16 | elif isinstance(dets, np.ndarray): 17 | is_tensor = False 18 | dets_np = dets 19 | else: 20 | raise TypeError( 21 | 'dets must be either a Tensor or numpy array, but got {}'.format( 22 | type(dets))) 23 | 24 | if dets_np.shape[0] == 0: 25 | inds = [] 26 | else: 27 | inds = (gpu_nms(dets_np, iou_thr, device_id=device_id) 28 | if device_id is not None else cpu_nms(dets_np, iou_thr)) 29 | 30 | if is_tensor: 31 | inds = dets.new_tensor(inds, dtype=torch.long) 32 | else: 33 | inds = np.array(inds, dtype=np.int64) 34 | return dets[inds, :], inds 35 | 36 | 37 | def soft_nms(dets, iou_thr, method='linear', sigma=0.5, min_score=1e-3): 38 | if isinstance(dets, torch.Tensor): 39 | is_tensor = True 40 | dets_np = dets.detach().cpu().numpy() 41 | elif isinstance(dets, np.ndarray): 42 | is_tensor = False 43 | dets_np = dets 44 | else: 45 | raise TypeError( 46 | 'dets must be either a Tensor or numpy array, but got {}'.format( 47 | type(dets))) 48 | 49 | method_codes = {'linear': 1, 'gaussian': 2} 50 | if method not in method_codes: 51 | raise ValueError('Invalid method for SoftNMS: {}'.format(method)) 52 | new_dets, inds = cpu_soft_nms( 53 | dets_np, 54 | iou_thr, 55 | method=method_codes[method], 56 | sigma=sigma, 57 | min_score=min_score) 58 | 59 | if is_tensor: 60 | return dets.new_tensor(new_dets), dets.new_tensor( 61 | inds, dtype=torch.long) 62 | else: 63 | return new_dets.astype(np.float32), inds.astype(np.int64) 64 | -------------------------------------------------------------------------------- /mmdet/apis/inference.py: -------------------------------------------------------------------------------- 1 | import mmcv 2 | import numpy as np 3 | import torch 4 | 5 | from mmdet.datasets import to_tensor 6 | from mmdet.datasets.transforms import ImageTransform 7 | from mmdet.core import get_classes 8 | 9 | 10 | def _prepare_data(img, img_transform, cfg, device): 11 | ori_shape = img.shape 12 | img, img_shape, pad_shape, scale_factor = img_transform( 13 | img, scale=cfg.data.test.img_scale) 14 | img = to_tensor(img).to(device).unsqueeze(0) 15 | img_meta = [ 16 | dict( 17 | ori_shape=ori_shape, 18 | img_shape=img_shape, 19 | pad_shape=pad_shape, 20 | scale_factor=scale_factor, 21 | flip=False) 22 | ] 23 | return dict(img=[img], img_meta=[img_meta]) 24 | 25 | 26 | def _inference_single(model, img, img_transform, cfg, device): 27 | img = mmcv.imread(img) 28 | data = _prepare_data(img, img_transform, cfg, device) 29 | with torch.no_grad(): 30 | result 
= model(return_loss=False, rescale=True, **data)
31 |     return result
32 | 
33 | 
34 | def _inference_generator(model, imgs, img_transform, cfg, device):
35 |     for img in imgs:
36 |         yield _inference_single(model, img, img_transform, cfg, device)
37 | 
38 | 
39 | def inference_detector(model, imgs, cfg, device='cuda:0'):
40 |     img_transform = ImageTransform(
41 |         size_divisor=cfg.data.test.size_divisor, **cfg.img_norm_cfg)
42 |     model = model.to(device)
43 |     model.eval()
44 | 
45 |     if not isinstance(imgs, list):
46 |         return _inference_single(model, imgs, img_transform, cfg, device)
47 |     else:
48 |         return _inference_generator(model, imgs, img_transform, cfg, device)
49 | 
50 | 
51 | def show_result(img, result, dataset='coco', score_thr=0.3):
52 |     class_names = get_classes(dataset)
53 |     labels = [
54 |         np.full(bbox.shape[0], i, dtype=np.int32)
55 |         for i, bbox in enumerate(result)
56 |     ]
57 |     labels = np.concatenate(labels)
58 |     bboxes = np.vstack(result)
59 |     img = mmcv.imread(img)
60 |     mmcv.imshow_det_bboxes(
61 |         img.copy(),
62 |         bboxes,
63 |         labels,
64 |         class_names=class_names,
65 |         score_thr=score_thr)
66 | 
--------------------------------------------------------------------------------
/mmdet/core/post_processing/bbox_nms.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | from mmdet.ops.nms import nms_wrapper
 4 | 
 5 | 
 6 | def multiclass_nms(multi_bboxes, multi_scores, score_thr, nms_cfg, max_num=-1, match_ids=None):
 7 |     """NMS for multi-class bboxes.
 8 | 
 9 |     Args:
10 |         multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
11 |         multi_scores (Tensor): shape (n, #class)
12 |         score_thr (float): bbox threshold, bboxes with scores lower than it
13 |             will not be considered.
14 |         nms_cfg (dict): NMS config; 'type' selects the op in nms_wrapper (default 'nms') and the remaining keys (e.g. iou_thr) are passed to it.
15 |         max_num (int): if there are more than max_num bboxes after NMS,
16 |             only top max_num will be kept.
17 | 
18 |     Returns:
19 |         tuple: (bboxes, labels), tensors of shape (k, 5) and (k, ). Labels
20 |             are 0-based.
21 |     """
22 |     num_classes = multi_scores.shape[1]
23 |     bboxes, labels, ids = [], [], []
24 |     nms_cfg_ = nms_cfg.copy()
25 |     nms_type = nms_cfg_.pop('type', 'nms')
26 |     nms_op = getattr(nms_wrapper, nms_type)
27 |     for i in range(1, num_classes):
28 |         cls_inds = multi_scores[:, i] > score_thr
29 |         if not cls_inds.any():
30 |             continue
31 |         # get bboxes and scores of this class
32 |         if multi_bboxes.shape[1] == 4:
33 |             _bboxes = multi_bboxes[cls_inds, :]
34 |         else:
35 |             _bboxes = multi_bboxes[cls_inds, i * 4:(i + 1) * 4]
36 |         _scores = multi_scores[cls_inds, i]
37 |         cls_dets = torch.cat([_bboxes, _scores[:, None]], dim=1)
38 |         cls_dets, _ = nms_op(cls_dets, **nms_cfg_)
39 |         cls_labels = multi_bboxes.new_full(
40 |             (cls_dets.shape[0], ), i - 1, dtype=torch.long)
41 |         bboxes.append(cls_dets)
42 |         labels.append(cls_labels)
43 |     if bboxes:
44 |         bboxes = torch.cat(bboxes)
45 |         labels = torch.cat(labels)
46 |         if max_num > 0 and bboxes.shape[0] > max_num:
47 |             _, inds = bboxes[:, -1].sort(descending=True)
48 |             inds = inds[:max_num]
49 |             bboxes = bboxes[inds]
50 |             labels = labels[inds]
51 |     else:
52 |         bboxes = multi_bboxes.new_zeros((0, 5))
53 |         labels = multi_bboxes.new_zeros((0, ), dtype=torch.long)
54 | 
55 |     return bboxes, labels
56 | 
--------------------------------------------------------------------------------
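A hedged usage sketch of multiclass_nms (toy shapes and values; the import path
simply mirrors this file's location in the repo):

    import torch
    from mmdet.core.post_processing.bbox_nms import multiclass_nms

    xy = torch.rand(100, 2) * 50            # top-left corners
    wh = torch.rand(100, 2) * 50 + 1        # box widths and heights
    multi_bboxes = torch.cat([xy, xy + wh], dim=1)  # (100, 4), shared across classes
    multi_scores = torch.rand(100, 81)      # (100, #class); column 0 is background
    dets, labels = multiclass_nms(
        multi_bboxes,
        multi_scores,
        score_thr=0.05,
        nms_cfg=dict(type='nms', iou_thr=0.5),
        max_num=100)
    # dets: (k, 5) as [x1, y1, x2, y2, score]; labels: (k, ), 0-based

/mmdet/ops/roi_align/functions/roi_align.py:
--------------------------------------------------------------------------------
 1 | from torch.autograd import Function, Variable
 2 | 
 3 | from ..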
import roi_align_cuda 4 | 5 | 6 | class RoIAlignFunction(Function): 7 | 8 | @staticmethod 9 | def forward(ctx, features, rois, out_size, spatial_scale, sample_num=0): 10 | if isinstance(out_size, int): 11 | out_h = out_size 12 | out_w = out_size 13 | elif isinstance(out_size, tuple): 14 | assert len(out_size) == 2 15 | assert isinstance(out_size[0], int) 16 | assert isinstance(out_size[1], int) 17 | out_h, out_w = out_size 18 | else: 19 | raise TypeError( 20 | '"out_size" must be an integer or tuple of integers') 21 | ctx.spatial_scale = spatial_scale 22 | ctx.sample_num = sample_num 23 | ctx.save_for_backward(rois) 24 | ctx.feature_size = features.size() 25 | 26 | batch_size, num_channels, data_height, data_width = features.size() 27 | num_rois = rois.size(0) 28 | 29 | output = features.new_zeros(num_rois, num_channels, out_h, out_w) 30 | if features.is_cuda: 31 | roi_align_cuda.forward(features, rois, out_h, out_w, spatial_scale, 32 | sample_num, output) 33 | else: 34 | raise NotImplementedError 35 | 36 | return output 37 | 38 | @staticmethod 39 | def backward(ctx, grad_output): 40 | feature_size = ctx.feature_size 41 | spatial_scale = ctx.spatial_scale 42 | sample_num = ctx.sample_num 43 | rois = ctx.saved_tensors[0] 44 | assert (feature_size is not None and grad_output.is_cuda) 45 | 46 | batch_size, num_channels, data_height, data_width = feature_size 47 | out_w = grad_output.size(3) 48 | out_h = grad_output.size(2) 49 | 50 | grad_input = grad_rois = None 51 | if ctx.needs_input_grad[0]: 52 | grad_input = Variable( 53 | rois.new(batch_size, num_channels, data_height, data_width) 54 | .zero_()) 55 | roi_align_cuda.backward(grad_output, rois, out_h, out_w, 56 | spatial_scale, sample_num, grad_input) 57 | 58 | return grad_input, grad_rois, None, None, None 59 | 60 | 61 | roi_align = RoIAlignFunction.apply 62 | -------------------------------------------------------------------------------- /mmdet/core/bbox/geometry.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False): 5 | """Calculate overlap between two set of bboxes. 6 | 7 | If ``is_aligned`` is ``False``, then calculate the ious between each bbox 8 | of bboxes1 and bboxes2, otherwise the ious between each aligned pair of 9 | bboxes1 and bboxes2. 10 | 11 | Args: 12 | bboxes1 (Tensor): shape (m, 4) 13 | bboxes2 (Tensor): shape (n, 4), if is_aligned is ``True``, then m and n 14 | must be equal. 15 | mode (str): "iou" (intersection over union) or iof (intersection over 16 | foreground). 
17 | 
18 |     Returns:
19 |         ious(Tensor): shape (m, n) if is_aligned == False else shape (m, )
20 |     """
21 | 
22 |     assert mode in ['iou', 'iof']
23 | 
24 |     rows = bboxes1.size(0)
25 |     cols = bboxes2.size(0)
26 |     if is_aligned:
27 |         assert rows == cols
28 | 
29 |     if rows * cols == 0:
30 |         # empty result; shapes match the corresponding non-empty branches below
31 |         return bboxes1.new(rows) if is_aligned else bboxes1.new(rows, cols)
32 | 
33 |     if is_aligned:
34 |         lt = torch.max(bboxes1[:, :2], bboxes2[:, :2])  # [rows, 2]
35 |         rb = torch.min(bboxes1[:, 2:], bboxes2[:, 2:])  # [rows, 2]
36 | 
37 |         wh = (rb - lt + 1).clamp(min=0)  # [rows, 2]
38 |         overlap = wh[:, 0] * wh[:, 1]
39 |         area1 = (bboxes1[:, 2] - bboxes1[:, 0] + 1) * (
40 |             bboxes1[:, 3] - bboxes1[:, 1] + 1)
41 | 
42 |         if mode == 'iou':
43 |             area2 = (bboxes2[:, 2] - bboxes2[:, 0] + 1) * (
44 |                 bboxes2[:, 3] - bboxes2[:, 1] + 1)
45 |             ious = overlap / (area1 + area2 - overlap)
46 |         else:
47 |             ious = overlap / area1
48 |     else:
49 |         lt = torch.max(bboxes1[:, None, :2], bboxes2[:, :2])  # [rows, cols, 2]
50 |         rb = torch.min(bboxes1[:, None, 2:], bboxes2[:, 2:])  # [rows, cols, 2]
51 | 
52 |         wh = (rb - lt + 1).clamp(min=0)  # [rows, cols, 2]
53 |         overlap = wh[:, :, 0] * wh[:, :, 1]
54 |         area1 = (bboxes1[:, 2] - bboxes1[:, 0] + 1) * (
55 |             bboxes1[:, 3] - bboxes1[:, 1] + 1)
56 | 
57 |         if mode == 'iou':
58 |             area2 = (bboxes2[:, 2] - bboxes2[:, 0] + 1) * (
59 |                 bboxes2[:, 3] - bboxes2[:, 1] + 1)
60 |             ious = overlap / (area1[:, None] + area2 - overlap)
61 |         else:
62 |             ious = overlap / (area1[:, None])
63 | 
64 |     return ious
65 | 
--------------------------------------------------------------------------------
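A quick numeric check of the conventions above (the +1 terms treat boxes as
inclusive pixel grids, so [0, 0, 9, 9] is a 10x10 box):

    import torch
    from mmdet.core.bbox.geometry import bbox_overlaps

    b1 = torch.Tensor([[0., 0., 9., 9.]])    # area 10 * 10 = 100
    b2 = torch.Tensor([[5., 5., 14., 14.]])  # area 100, overlapping b1 on a 5x5 patch
    bbox_overlaps(b1, b2, mode='iou')  # 25 / (100 + 100 - 25) ~= 0.143
    bbox_overlaps(b1, b2, mode='iof')  # 25 / 100 = 0.25 (normalized by b1's area only)

/mmdet/models/detectors/single_stage.py:
--------------------------------------------------------------------------------
 1 | import torch.nn as nn
 2 | 
 3 | from .base import BaseDetector
 4 | from ..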
import builder 5 | from ..registry import DETECTORS 6 | from mmdet.core import bbox2result 7 | 8 | 9 | @DETECTORS.register_module 10 | class SingleStageDetector(BaseDetector): 11 | 12 | def __init__(self, 13 | backbone, 14 | neck=None, 15 | bbox_head=None, 16 | train_cfg=None, 17 | test_cfg=None, 18 | pretrained=None): 19 | super(SingleStageDetector, self).__init__() 20 | self.backbone = builder.build_backbone(backbone) 21 | if neck is not None: 22 | self.neck = builder.build_neck(neck) 23 | self.bbox_head = builder.build_head(bbox_head) 24 | self.train_cfg = train_cfg 25 | self.test_cfg = test_cfg 26 | self.init_weights(pretrained=pretrained) 27 | 28 | def init_weights(self, pretrained=None): 29 | super(SingleStageDetector, self).init_weights(pretrained) 30 | self.backbone.init_weights(pretrained=pretrained) 31 | if self.with_neck: 32 | if isinstance(self.neck, nn.Sequential): 33 | for m in self.neck: 34 | m.init_weights() 35 | else: 36 | self.neck.init_weights() 37 | self.bbox_head.init_weights() 38 | 39 | def extract_feat(self, img): 40 | x = self.backbone(img) 41 | if self.with_neck: 42 | x = self.neck(x) 43 | return x 44 | 45 | def forward_train(self, img, img_metas, gt_bboxes, gt_labels): 46 | x = self.extract_feat(img) 47 | outs = self.bbox_head(x) 48 | loss_inputs = outs + (gt_bboxes, gt_labels, img_metas, self.train_cfg) 49 | losses = self.bbox_head.loss(*loss_inputs) 50 | return losses 51 | 52 | def simple_test(self, img, img_meta, rescale=False): 53 | x = self.extract_feat(img) 54 | outs = self.bbox_head(x) 55 | bbox_inputs = outs + (img_meta, self.test_cfg, rescale) 56 | bbox_list = self.bbox_head.get_bboxes(*bbox_inputs) 57 | bbox_results = [ 58 | bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes) 59 | for det_bboxes, det_labels in bbox_list 60 | ] 61 | return bbox_results[0] 62 | 63 | def aug_test(self, imgs, img_metas, rescale=False): 64 | raise NotImplementedError 65 | -------------------------------------------------------------------------------- /mmdet/ops/nms/cpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 12 | return a if a >= b else b 13 | 14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 15 | return a if a <= b else b 16 | 17 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 18 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 19 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 20 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 21 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 22 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 23 | 24 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 25 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 26 | 27 | cdef int ndets = dets.shape[0] 28 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 29 | np.zeros((ndets), dtype=np.int) 30 | 31 | # nominal indices 32 | cdef int _i, _j 33 | # sorted indices 34 | cdef int i, j 35 | # temp variables for box i's (the box currently under consideration) 36 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 37 | # variables for 
computing overlap with box j (lower scoring box) 38 | cdef np.float32_t xx1, yy1, xx2, yy2 39 | cdef np.float32_t w, h 40 | cdef np.float32_t inter, ovr 41 | 42 | keep = [] 43 | for _i in range(ndets): 44 | i = order[_i] 45 | if suppressed[i] == 1: 46 | continue 47 | keep.append(i) 48 | ix1 = x1[i] 49 | iy1 = y1[i] 50 | ix2 = x2[i] 51 | iy2 = y2[i] 52 | iarea = areas[i] 53 | for _j in range(_i + 1, ndets): 54 | j = order[_j] 55 | if suppressed[j] == 1: 56 | continue 57 | xx1 = max(ix1, x1[j]) 58 | yy1 = max(iy1, y1[j]) 59 | xx2 = min(ix2, x2[j]) 60 | yy2 = min(iy2, y2[j]) 61 | w = max(0.0, xx2 - xx1 + 1) 62 | h = max(0.0, yy2 - yy1 + 1) 63 | inter = w * h 64 | ovr = inter / (iarea + areas[j] - inter) 65 | if ovr >= thresh: 66 | suppressed[j] = 1 67 | 68 | return keep 69 | -------------------------------------------------------------------------------- /mmdet/models/anchor_heads/retina_head.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch.nn as nn 3 | from mmcv.cnn import normal_init 4 | 5 | from .anchor_head import AnchorHead 6 | from ..registry import HEADS 7 | from ..utils import bias_init_with_prob 8 | 9 | 10 | @HEADS.register_module 11 | class RetinaHead(AnchorHead): 12 | 13 | def __init__(self, 14 | num_classes, 15 | in_channels, 16 | stacked_convs=4, 17 | octave_base_scale=4, 18 | scales_per_octave=3, 19 | **kwargs): 20 | self.stacked_convs = stacked_convs 21 | self.octave_base_scale = octave_base_scale 22 | self.scales_per_octave = scales_per_octave 23 | octave_scales = np.array( 24 | [2**(i / scales_per_octave) for i in range(scales_per_octave)]) 25 | anchor_scales = octave_scales * octave_base_scale 26 | super(RetinaHead, self).__init__( 27 | num_classes, 28 | in_channels, 29 | anchor_scales=anchor_scales, 30 | use_sigmoid_cls=True, 31 | use_focal_loss=True, 32 | **kwargs) 33 | 34 | def _init_layers(self): 35 | self.relu = nn.ReLU(inplace=True) 36 | self.cls_convs = nn.ModuleList() 37 | self.reg_convs = nn.ModuleList() 38 | for i in range(self.stacked_convs): 39 | chn = self.in_channels if i == 0 else self.feat_channels 40 | self.cls_convs.append( 41 | nn.Conv2d(chn, self.feat_channels, 3, stride=1, padding=1)) 42 | self.reg_convs.append( 43 | nn.Conv2d(chn, self.feat_channels, 3, stride=1, padding=1)) 44 | self.retina_cls = nn.Conv2d( 45 | self.feat_channels, 46 | self.num_anchors * self.cls_out_channels, 47 | 3, 48 | padding=1) 49 | self.retina_reg = nn.Conv2d( 50 | self.feat_channels, self.num_anchors * 4, 3, padding=1) 51 | 52 | def init_weights(self): 53 | for m in self.cls_convs: 54 | normal_init(m, std=0.01) 55 | for m in self.reg_convs: 56 | normal_init(m, std=0.01) 57 | bias_cls = bias_init_with_prob(0.01) 58 | normal_init(self.retina_cls, std=0.01, bias=bias_cls) 59 | normal_init(self.retina_reg, std=0.01) 60 | 61 | def forward_single(self, x): 62 | cls_feat = x 63 | reg_feat = x 64 | for cls_conv in self.cls_convs: 65 | cls_feat = self.relu(cls_conv(cls_feat)) 66 | for reg_conv in self.reg_convs: 67 | reg_feat = self.relu(reg_conv(reg_feat)) 68 | cls_score = self.retina_cls(cls_feat) 69 | bbox_pred = self.retina_reg(reg_feat) 70 | return cls_score, bbox_pred 71 | -------------------------------------------------------------------------------- /mmdet/ops/nms/setup.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | from distutils.core import setup, Extension 3 | 4 | import numpy as np 5 | from Cython.Build import cythonize 6 | from 
Cython.Distutils import build_ext
 7 | 
 8 | # extensions
 9 | ext_args = dict(
10 |     include_dirs=[np.get_include()],
11 |     language='c++',
12 |     extra_compile_args={
13 |         'cc': ['-Wno-unused-function', '-Wno-write-strings'],
14 |         'nvcc': ['-c', '--compiler-options', '-fPIC'],
15 |     },
16 | )
17 | 
18 | extensions = [
19 |     Extension('cpu_nms', ['cpu_nms.pyx'], **ext_args),
20 |     Extension('cpu_soft_nms', ['cpu_soft_nms.pyx'], **ext_args),
21 |     Extension('gpu_nms', ['gpu_nms.pyx', 'nms_kernel.cu'], **ext_args),
22 | ]
23 | 
24 | 
25 | def customize_compiler_for_nvcc(self):
26 |     """Inject deep into distutils to customize how the dispatch
27 |     to cc/nvcc works.
28 |     If you subclass UnixCCompiler, it's not trivial to get your subclass
29 |     injected in, and still have the right customizations (i.e.
30 |     distutils.sysconfig.customize_compiler) run on it. So instead of going
31 |     the OO route, I have this. Note, it's kind of like a weird functional
32 |     subclassing going on."""
33 | 
34 |     # tell the compiler it can process .cu files
35 |     self.src_extensions.append('.cu')
36 | 
37 |     # save references to the default compiler_so and _compile methods
38 |     default_compiler_so = self.compiler_so
39 |     super = self._compile
40 | 
41 |     # now redefine the _compile method. This gets executed for each
42 |     # object but distutils doesn't have the ability to change compilers
43 |     # based on source extension: we add it.
44 |     def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
45 |         if osp.splitext(src)[1] == '.cu':
46 |             # use nvcc for .cu files
47 |             self.set_executable('compiler_so', 'nvcc')
48 |             # use only a subset of the extra_postargs, which are 1-1 translated
49 |             # from the extra_compile_args in the Extension class
50 |             postargs = extra_postargs['nvcc']
51 |         else:
52 |             postargs = extra_postargs['cc']
53 | 
54 |         super(obj, src, ext, cc_args, postargs, pp_opts)
55 |         # reset the default compiler_so, which we might have changed for cuda
56 |         self.compiler_so = default_compiler_so
57 | 
58 |     # inject our redefined _compile method into the class
59 |     self._compile = _compile
60 | 
61 | 
62 | # run the customize_compiler
63 | class custom_build_ext(build_ext):
64 | 
65 |     def build_extensions(self):
66 |         customize_compiler_for_nvcc(self.compiler)
67 |         build_ext.build_extensions(self)
68 | 
69 | 
70 | setup(
71 |     name='nms',
72 |     cmdclass={'build_ext': custom_build_ext},
73 |     ext_modules=cythonize(extensions),
74 | )
75 | 
--------------------------------------------------------------------------------
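To build this extension in place, the usual invocation is the following (a
sketch: it assumes Cython, numpy and a CUDA toolchain with nvcc on PATH; the
repo-level compile.sh exists to automate this per op directory):

    cd mmdet/ops/nms && python setup.py build_ext --inplace

/mmdet/core/bbox/samplers/ohem_sampler.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | from .base_sampler import BaseSampler
 4 | from ..transforms import bbox2roi
 5 | 
 6 | 
 7 | class OHEMSampler(BaseSampler):
 8 | 
 9 |     def __init__(self,
10 |                  num,
11 |                  pos_fraction,
12 |                  context,
13 |                  neg_pos_ub=-1,
14 |                  add_gt_as_proposals=True,
15 |                  **kwargs):
16 |         super(OHEMSampler, self).__init__(num, pos_fraction, neg_pos_ub,
17 |                                           add_gt_as_proposals)
18 |         self.bbox_roi_extractor = context.bbox_roi_extractor
19 |         self.bbox_head = context.bbox_head
20 | 
21 |     def hard_mining(self, inds, num_expected, bboxes, labels, feats):
22 |         with torch.no_grad():
23 |             rois = bbox2roi([bboxes])
24 |             bbox_feats = self.bbox_roi_extractor(
25 |                 feats[:self.bbox_roi_extractor.num_inputs], rois)
26 |             cls_score, _ = self.bbox_head(bbox_feats)
27 |             loss = self.bbox_head.loss(
28 |                 cls_score=cls_score,
29 |                 bbox_pred=None,
30 |                 labels=labels,
31 |                 label_weights=cls_score.new_ones(cls_score.size(0)),
32 |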
bbox_targets=None, 33 | bbox_weights=None, 34 | reduce=False)['loss_cls'] 35 | _, topk_loss_inds = loss.topk(num_expected) 36 | return inds[topk_loss_inds] 37 | 38 | def _sample_pos(self, 39 | assign_result, 40 | num_expected, 41 | bboxes=None, 42 | feats=None, 43 | **kwargs): 44 | # Sample some hard positive samples 45 | pos_inds = torch.nonzero(assign_result.gt_inds > 0) 46 | if pos_inds.numel() != 0: 47 | pos_inds = pos_inds.squeeze(1) 48 | if pos_inds.numel() <= num_expected: 49 | return pos_inds 50 | else: 51 | return self.hard_mining(pos_inds, num_expected, bboxes[pos_inds], 52 | assign_result.labels[pos_inds], feats) 53 | 54 | def _sample_neg(self, 55 | assign_result, 56 | num_expected, 57 | bboxes=None, 58 | feats=None, 59 | **kwargs): 60 | # Sample some hard negative samples 61 | neg_inds = torch.nonzero(assign_result.gt_inds == 0) 62 | if neg_inds.numel() != 0: 63 | neg_inds = neg_inds.squeeze(1) 64 | if len(neg_inds) <= num_expected: 65 | return neg_inds 66 | else: 67 | return self.hard_mining(neg_inds, num_expected, bboxes[neg_inds], 68 | assign_result.labels[neg_inds], feats) 69 | -------------------------------------------------------------------------------- /mmdet/datasets/xml_style.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import xml.etree.ElementTree as ET 3 | 4 | import mmcv 5 | import numpy as np 6 | 7 | from .custom import CustomDataset 8 | 9 | 10 | class XMLDataset(CustomDataset): 11 | 12 | def __init__(self, **kwargs): 13 | super(XMLDataset, self).__init__(**kwargs) 14 | self.cat2label = {cat: i + 1 for i, cat in enumerate(self.CLASSES)} 15 | 16 | def load_annotations(self, ann_file): 17 | img_infos = [] 18 | img_ids = mmcv.list_from_file(ann_file) 19 | for img_id in img_ids: 20 | filename = 'JPEGImages/{}.jpg'.format(img_id) 21 | xml_path = osp.join(self.img_prefix, 'Annotations', 22 | '{}.xml'.format(img_id)) 23 | tree = ET.parse(xml_path) 24 | root = tree.getroot() 25 | size = root.find('size') 26 | width = int(size.find('width').text) 27 | height = int(size.find('height').text) 28 | img_infos.append( 29 | dict(id=img_id, filename=filename, width=width, height=height)) 30 | return img_infos 31 | 32 | def get_ann_info(self, idx): 33 | img_id = self.img_infos[idx]['id'] 34 | xml_path = osp.join(self.img_prefix, 'Annotations', 35 | '{}.xml'.format(img_id)) 36 | tree = ET.parse(xml_path) 37 | root = tree.getroot() 38 | bboxes = [] 39 | labels = [] 40 | bboxes_ignore = [] 41 | labels_ignore = [] 42 | for obj in root.findall('object'): 43 | name = obj.find('name').text 44 | label = self.cat2label[name] 45 | difficult = int(obj.find('difficult').text) 46 | bnd_box = obj.find('bndbox') 47 | bbox = [ 48 | int(bnd_box.find('xmin').text), 49 | int(bnd_box.find('ymin').text), 50 | int(bnd_box.find('xmax').text), 51 | int(bnd_box.find('ymax').text) 52 | ] 53 | if difficult: 54 | bboxes_ignore.append(bbox) 55 | labels_ignore.append(label) 56 | else: 57 | bboxes.append(bbox) 58 | labels.append(label) 59 | if not bboxes: 60 | bboxes = np.zeros((0, 4)) 61 | labels = np.zeros((0, )) 62 | else: 63 | bboxes = np.array(bboxes, ndmin=2) - 1 64 | labels = np.array(labels) 65 | if not bboxes_ignore: 66 | bboxes_ignore = np.zeros((0, 4)) 67 | labels_ignore = np.zeros((0, )) 68 | else: 69 | bboxes_ignore = np.array(bboxes_ignore, ndmin=2) - 1 70 | labels_ignore = np.array(labels_ignore) 71 | ann = dict( 72 | bboxes=bboxes.astype(np.float32), 73 | labels=labels.astype(np.int64), 74 | 
bboxes_ignore=bboxes_ignore.astype(np.float32), 75 | labels_ignore=labels_ignore.astype(np.int64)) 76 | return ann 77 | -------------------------------------------------------------------------------- /mmdet/core/bbox/samplers/iou_balanced_neg_sampler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from .random_sampler import RandomSampler 5 | 6 | 7 | class IoUBalancedNegSampler(RandomSampler): 8 | 9 | def __init__(self, 10 | num, 11 | pos_fraction, 12 | hard_thr=0.1, 13 | hard_fraction=0.5, 14 | **kwargs): 15 | super(IoUBalancedNegSampler, self).__init__(num, pos_fraction, 16 | **kwargs) 17 | assert hard_thr > 0 18 | assert 0 < hard_fraction < 1 19 | self.hard_thr = hard_thr 20 | self.hard_fraction = hard_fraction 21 | 22 | def _sample_neg(self, assign_result, num_expected, **kwargs): 23 | neg_inds = torch.nonzero(assign_result.gt_inds == 0) 24 | if neg_inds.numel() != 0: 25 | neg_inds = neg_inds.squeeze(1) 26 | if len(neg_inds) <= num_expected: 27 | return neg_inds 28 | else: 29 | max_overlaps = assign_result.max_overlaps.cpu().numpy() 30 | # balance sampling for negative samples 31 | neg_set = set(neg_inds.cpu().numpy()) 32 | easy_set = set( 33 | np.where( 34 | np.logical_and(max_overlaps >= 0, 35 | max_overlaps < self.hard_thr))[0]) 36 | hard_set = set(np.where(max_overlaps >= self.hard_thr)[0]) 37 | easy_neg_inds = list(easy_set & neg_set) 38 | hard_neg_inds = list(hard_set & neg_set) 39 | 40 | num_expected_hard = int(num_expected * self.hard_fraction) 41 | if len(hard_neg_inds) > num_expected_hard: 42 | sampled_hard_inds = self.random_choice(hard_neg_inds, 43 | num_expected_hard) 44 | else: 45 | sampled_hard_inds = np.array(hard_neg_inds, dtype=np.int) 46 | num_expected_easy = num_expected - len(sampled_hard_inds) 47 | if len(easy_neg_inds) > num_expected_easy: 48 | sampled_easy_inds = self.random_choice(easy_neg_inds, 49 | num_expected_easy) 50 | else: 51 | sampled_easy_inds = np.array(easy_neg_inds, dtype=np.int) 52 | sampled_inds = np.concatenate((sampled_easy_inds, 53 | sampled_hard_inds)) 54 | if len(sampled_inds) < num_expected: 55 | num_extra = num_expected - len(sampled_inds) 56 | extra_inds = np.array(list(neg_set - set(sampled_inds))) 57 | if len(extra_inds) > num_extra: 58 | extra_inds = self.random_choice(extra_inds, num_extra) 59 | sampled_inds = np.concatenate((sampled_inds, extra_inds)) 60 | sampled_inds = torch.from_numpy(sampled_inds).long().to( 61 | assign_result.gt_inds.device) 62 | return sampled_inds 63 | -------------------------------------------------------------------------------- /tools/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import argparse 4 | from mmcv import Config 5 | 6 | from mmdet import __version__ 7 | from mmdet.datasets import get_dataset 8 | from mmdet.apis import (train_detector, init_dist, get_root_logger, 9 | set_random_seed) 10 | from mmdet.models import build_detector 11 | import torch 12 | 13 | 14 | def parse_args(): 15 | parser = argparse.ArgumentParser(description='Train a detector') 16 | parser.add_argument('config', help='train config file path') 17 | parser.add_argument('--work_dir', help='the dir to save logs and models') 18 | parser.add_argument( 19 | '--resume_from', help='the checkpoint file to resume from') 20 | parser.add_argument( 21 | '--validate', 22 | action='store_true', 23 | help='whether to evaluate the checkpoint during training') 24 | 
parser.add_argument( 25 | '--gpus', 26 | type=int, 27 | default=1, 28 | help='number of gpus to use ' 29 | '(only applicable to non-distributed training)') 30 | parser.add_argument('--seed', type=int, default=None, help='random seed') 31 | parser.add_argument( 32 | '--launcher', 33 | choices=['none', 'pytorch', 'slurm', 'mpi'], 34 | default='none', 35 | help='job launcher') 36 | parser.add_argument('--local_rank', type=int, default=0) 37 | args = parser.parse_args() 38 | 39 | return args 40 | 41 | 42 | def main(): 43 | args = parse_args() 44 | 45 | cfg = Config.fromfile(args.config) 46 | # set cudnn_benchmark 47 | if cfg.get('cudnn_benchmark', False): 48 | torch.backends.cudnn.benchmark = True 49 | # update configs according to CLI args 50 | if args.work_dir is not None: 51 | cfg.work_dir = args.work_dir 52 | if args.resume_from is not None: 53 | cfg.resume_from = args.resume_from 54 | cfg.gpus = args.gpus 55 | if cfg.checkpoint_config is not None: 56 | # save mmdet version in checkpoints as meta data 57 | cfg.checkpoint_config.meta = dict( 58 | mmdet_version=__version__, config=cfg.text) 59 | 60 | # init distributed env first, since logger depends on the dist info. 61 | if args.launcher == 'none': 62 | distributed = False 63 | else: 64 | distributed = True 65 | init_dist(args.launcher, **cfg.dist_params) 66 | 67 | # init logger before other steps 68 | logger = get_root_logger(cfg.log_level) 69 | logger.info('Distributed training: {}'.format(distributed)) 70 | 71 | # set random seeds 72 | if args.seed is not None: 73 | logger.info('Set random seed to {}'.format(args.seed)) 74 | set_random_seed(args.seed) 75 | 76 | model = build_detector( 77 | cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg) 78 | 79 | train_dataset = get_dataset(cfg.data.train) 80 | train_detector( 81 | model, 82 | train_dataset, 83 | cfg, 84 | distributed=distributed, 85 | validate=args.validate, 86 | logger=logger) 87 | 88 | 89 | if __name__ == '__main__': 90 | main() 91 | -------------------------------------------------------------------------------- /mmdet/core/bbox/samplers/base_sampler.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | 3 | import torch 4 | 5 | from .sampling_result import SamplingResult 6 | 7 | 8 | class BaseSampler(metaclass=ABCMeta): 9 | 10 | def __init__(self, 11 | num, 12 | pos_fraction, 13 | neg_pos_ub=-1, 14 | add_gt_as_proposals=True, 15 | **kwargs): 16 | self.num = num 17 | self.pos_fraction = pos_fraction 18 | self.neg_pos_ub = neg_pos_ub 19 | self.add_gt_as_proposals = add_gt_as_proposals 20 | self.pos_sampler = self 21 | self.neg_sampler = self 22 | 23 | @abstractmethod 24 | def _sample_pos(self, assign_result, num_expected, **kwargs): 25 | pass 26 | 27 | @abstractmethod 28 | def _sample_neg(self, assign_result, num_expected, **kwargs): 29 | pass 30 | 31 | def sample(self, 32 | assign_result, 33 | bboxes, 34 | gt_bboxes, 35 | gt_labels=None, 36 | gt_pids = None, 37 | **kwargs): 38 | """Sample positive and negative bboxes. 39 | 40 | This is a simple implementation of bbox sampling given candidates, 41 | assigning results and ground truth bboxes. 42 | 43 | Args: 44 | assign_result (:obj:`AssignResult`): Bbox assigning results. 45 | bboxes (Tensor): Boxes to be sampled from. 46 | gt_bboxes (Tensor): Ground truth bboxes. 47 | gt_labels (Tensor, optional): Class labels of ground truth bboxes. 
48 | gt_pids (Tensor, optional): identity labels of ground truth bboxes 49 | Returns: 50 | :obj:`SamplingResult`: Sampling result. 51 | """ 52 | bboxes = bboxes[:, :4] 53 | 54 | gt_flags = bboxes.new_zeros((bboxes.shape[0], ), dtype=torch.uint8) 55 | if self.add_gt_as_proposals: 56 | bboxes = torch.cat([gt_bboxes, bboxes], dim=0) 57 | assign_result.add_gt_(gt_labels, gt_pids) 58 | gt_ones = bboxes.new_ones(gt_bboxes.shape[0], dtype=torch.uint8) 59 | gt_flags = torch.cat([gt_ones, gt_flags]) 60 | 61 | num_expected_pos = int(self.num * self.pos_fraction) 62 | pos_inds = self.pos_sampler._sample_pos( 63 | assign_result, num_expected_pos, bboxes=bboxes, **kwargs) 64 | # We found that sampled indices have duplicated items occasionally. 65 | # (may be a bug of PyTorch) 66 | pos_inds = pos_inds.unique() 67 | num_sampled_pos = pos_inds.numel() 68 | num_expected_neg = self.num - num_sampled_pos 69 | if self.neg_pos_ub >= 0: 70 | _pos = max(1, num_sampled_pos) 71 | neg_upper_bound = int(self.neg_pos_ub * _pos) 72 | if num_expected_neg > neg_upper_bound: 73 | num_expected_neg = neg_upper_bound 74 | neg_inds = self.neg_sampler._sample_neg( 75 | assign_result, num_expected_neg, bboxes=bboxes, **kwargs) 76 | neg_inds = neg_inds.unique() 77 | 78 | return SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes, 79 | assign_result, gt_flags) 80 | -------------------------------------------------------------------------------- /mmdet/models/utils/conv_module.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import torch.nn as nn 4 | from mmcv.cnn import kaiming_init, constant_init 5 | 6 | from .norm import build_norm_layer 7 | 8 | 9 | class ConvModule(nn.Module): 10 | 11 | def __init__(self, 12 | in_channels, 13 | out_channels, 14 | kernel_size, 15 | stride=1, 16 | padding=0, 17 | dilation=1, 18 | groups=1, 19 | bias=True, 20 | normalize=None, 21 | activation='relu', 22 | inplace=True, 23 | activate_last=True): 24 | super(ConvModule, self).__init__() 25 | self.with_norm = normalize is not None 26 | self.with_activatation = activation is not None 27 | self.with_bias = bias 28 | self.activation = activation 29 | self.activate_last = activate_last 30 | 31 | if self.with_norm and self.with_bias: 32 | warnings.warn('ConvModule has norm and bias at the same time') 33 | 34 | self.conv = nn.Conv2d( 35 | in_channels, 36 | out_channels, 37 | kernel_size, 38 | stride, 39 | padding, 40 | dilation, 41 | groups, 42 | bias=bias) 43 | 44 | self.in_channels = self.conv.in_channels 45 | self.out_channels = self.conv.out_channels 46 | self.kernel_size = self.conv.kernel_size 47 | self.stride = self.conv.stride 48 | self.padding = self.conv.padding 49 | self.dilation = self.conv.dilation 50 | self.transposed = self.conv.transposed 51 | self.output_padding = self.conv.output_padding 52 | self.groups = self.conv.groups 53 | 54 | if self.with_norm: 55 | norm_channels = out_channels if self.activate_last else in_channels 56 | self.norm_name, norm = build_norm_layer(normalize, norm_channels) 57 | self.add_module(self.norm_name, norm) 58 | 59 | if self.with_activatation: 60 | assert activation in ['relu'], 'Only ReLU supported.' 
61 |             if self.activation == 'relu':
62 |                 self.activate = nn.ReLU(inplace=inplace)
63 | 
64 |         # Use msra init by default
65 |         self.init_weights()
66 | 
67 |     @property
68 |     def norm(self):
69 |         return getattr(self, self.norm_name)
70 | 
71 |     def init_weights(self):
72 |         nonlinearity = 'relu' if self.activation is None else self.activation
73 |         kaiming_init(self.conv, nonlinearity=nonlinearity)
74 |         if self.with_norm:
75 |             constant_init(self.norm, 1, bias=0)
76 | 
77 |     def forward(self, x, activate=True, norm=True):
78 |         if self.activate_last:
79 |             x = self.conv(x)
80 |             if norm and self.with_norm:
81 |                 x = self.norm(x)
82 |             if activate and self.with_activatation:
83 |                 x = self.activate(x)
84 |         else:
85 |             if norm and self.with_norm:
86 |                 x = self.norm(x)
87 |             if activate and self.with_activatation:
88 |                 x = self.activate(x)
89 |             x = self.conv(x)
90 |         return x
91 | 
--------------------------------------------------------------------------------
/mmdet/ops/roi_pool/src/roi_pool_cuda.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | 
 3 | #include <cmath>
 4 | #include <vector>
 5 | 
 6 | int ROIPoolForwardLaucher(const at::Tensor features, const at::Tensor rois,
 7 |                           const float spatial_scale, const int channels,
 8 |                           const int height, const int width, const int num_rois,
 9 |                           const int pooled_h, const int pooled_w,
10 |                           at::Tensor output, at::Tensor argmax);
11 | 
12 | int ROIPoolBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
13 |                            const at::Tensor argmax, const float spatial_scale,
14 |                            const int batch_size, const int channels,
15 |                            const int height, const int width,
16 |                            const int num_rois, const int pooled_h,
17 |                            const int pooled_w, at::Tensor bottom_grad);
18 | 
19 | #define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
20 | #define CHECK_CONTIGUOUS(x) \
21 |   AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
22 | #define CHECK_INPUT(x) \
23 |   CHECK_CUDA(x);       \
24 |   CHECK_CONTIGUOUS(x)
25 | 
26 | int roi_pooling_forward_cuda(at::Tensor features, at::Tensor rois,
27 |                              int pooled_height, int pooled_width,
28 |                              float spatial_scale, at::Tensor output,
29 |                              at::Tensor argmax) {
30 |   CHECK_INPUT(features);
31 |   CHECK_INPUT(rois);
32 |   CHECK_INPUT(output);
33 |   CHECK_INPUT(argmax);
34 | 
35 |   // Number of ROIs
36 |   int num_rois = rois.size(0);
37 |   int size_rois = rois.size(1);
38 | 
39 |   if (size_rois != 5) {
40 |     printf("wrong roi size\n");
41 |     return 0;
42 |   }
43 | 
44 |   int channels = features.size(1);
45 |   int height = features.size(2);
46 |   int width = features.size(3);
47 | 
48 |   ROIPoolForwardLaucher(features, rois, spatial_scale, channels, height, width,
49 |                         num_rois, pooled_height, pooled_width, output, argmax);
50 | 
51 |   return 1;
52 | }
53 | 
54 | int roi_pooling_backward_cuda(at::Tensor top_grad, at::Tensor rois,
55 |                               at::Tensor argmax, float spatial_scale,
56 |                               at::Tensor bottom_grad) {
57 |   CHECK_INPUT(top_grad);
58 |   CHECK_INPUT(rois);
59 |   CHECK_INPUT(argmax);
60 |   CHECK_INPUT(bottom_grad);
61 | 
62 |   int pooled_height = top_grad.size(2);
63 |   int pooled_width = top_grad.size(3);
64 |   int num_rois = rois.size(0);
65 |   int size_rois = rois.size(1);
66 | 
67 |   if (size_rois != 5) {
68 |     printf("wrong roi size\n");
69 |     return 0;
70 |   }
71 |   int batch_size = bottom_grad.size(0);
72 |   int channels = bottom_grad.size(1);
73 |   int height = bottom_grad.size(2);
74 |   int width = bottom_grad.size(3);
75 | 
76 |   ROIPoolBackwardLaucher(top_grad, rois, argmax, spatial_scale, batch_size,
77 |                          channels, height, width, num_rois, pooled_height,
78 |                          pooled_width, bottom_grad);
79 | 
80 |   return 1;
81 | }
82 | 
83 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
84 |   m.def("forward", &roi_pooling_forward_cuda, "Roi_Pooling forward (CUDA)");
85 |   m.def("backward", &roi_pooling_backward_cuda, "Roi_Pooling backward (CUDA)");
86 | }
87 | 
--------------------------------------------------------------------------------
/mmdet/ops/roi_align/src/roi_align_cuda.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | 
 3 | #include <cmath>
 4 | #include <vector>
 5 | 
 6 | int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois,
 7 |                            const float spatial_scale, const int sample_num,
 8 |                            const int channels, const int height,
 9 |                            const int width, const int num_rois,
10 |                            const int pooled_height, const int pooled_width,
11 |                            at::Tensor output);
12 | 
13 | int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
14 |                             const float spatial_scale, const int sample_num,
15 |                             const int channels, const int height,
16 |                             const int width, const int num_rois,
17 |                             const int pooled_height, const int pooled_width,
18 |                             at::Tensor bottom_grad);
19 | 
20 | #define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
21 | #define CHECK_CONTIGUOUS(x) \
22 |   AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
23 | #define CHECK_INPUT(x) \
24 |   CHECK_CUDA(x);       \
25 |   CHECK_CONTIGUOUS(x)
26 | 
27 | int roi_align_forward_cuda(at::Tensor features, at::Tensor rois,
28 |                            int pooled_height, int pooled_width,
29 |                            float spatial_scale, int sample_num,
30 |                            at::Tensor output) {
31 |   CHECK_INPUT(features);
32 |   CHECK_INPUT(rois);
33 |   CHECK_INPUT(output);
34 | 
35 |   // Number of ROIs
36 |   int num_rois = rois.size(0);
37 |   int size_rois = rois.size(1);
38 | 
39 |   if (size_rois != 5) {
40 |     printf("wrong roi size\n");
41 |     return 0;
42 |   }
43 | 
44 |   int num_channels = features.size(1);
45 |   int data_height = features.size(2);
46 |   int data_width = features.size(3);
47 | 
48 |   ROIAlignForwardLaucher(features, rois, spatial_scale, sample_num,
49 |                          num_channels, data_height, data_width, num_rois,
50 |                          pooled_height, pooled_width, output);
51 | 
52 |   return 1;
53 | }
54 | 
55 | int roi_align_backward_cuda(at::Tensor top_grad, at::Tensor rois,
56 |                             int pooled_height, int pooled_width,
57 |                             float spatial_scale, int sample_num,
58 |                             at::Tensor bottom_grad) {
59 |   CHECK_INPUT(top_grad);
60 |   CHECK_INPUT(rois);
61 |   CHECK_INPUT(bottom_grad);
62 | 
63 |   // Number of ROIs
64 |   int num_rois = rois.size(0);
65 |   int size_rois = rois.size(1);
66 |   if (size_rois != 5) {
67 |     printf("wrong roi size\n");
68 |     return 0;
69 |   }
70 | 
71 |   int num_channels = bottom_grad.size(1);
72 |   int data_height = bottom_grad.size(2);
73 |   int data_width = bottom_grad.size(3);
74 | 
75 |   ROIAlignBackwardLaucher(top_grad, rois, spatial_scale, sample_num,
76 |                           num_channels, data_height, data_width, num_rois,
77 |                           pooled_height, pooled_width, bottom_grad);
78 | 
79 |   return 1;
80 | }
81 | 
82 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
83 |   m.def("forward", &roi_align_forward_cuda, "Roi_Align forward (CUDA)");
84 |   m.def("backward", &roi_align_backward_cuda, "Roi_Align backward (CUDA)");
85 | }
86 | 
--------------------------------------------------------------------------------
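A note on the RoI layout enforced by the size_rois != 5 checks above: each RoI
row is (batch_index, x1, y1, x2, y2), the format produced by bbox2roi on the
Python side. A toy construction (illustrative values only):

    import torch
    # two RoIs, both taken from image 0 of the batch
    rois = torch.tensor([[0., 10., 10., 50., 60.],
                         [0., 20., 5., 40., 30.]], device='cuda')
    # features: (N, C, H, W); the op fills an output of shape
    # (num_rois, C, pooled_h, pooled_w)

/mmdet/models/roi_extractors/single_level.py:
--------------------------------------------------------------------------------
 1 | from __future__ import division
 2 | 
 3 | import torch
 4 | import torch.nn as nn
 5 | 
 6 | from mmdet import ops
 7 | from ..registry import ROI_EXTRACTORS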
 8 | 
 9 | 
10 | @ROI_EXTRACTORS.register_module
11 | class SingleRoIExtractor(nn.Module):
12 |     """Extract RoI features from a single level feature map.
13 | 
14 |     If there are multiple input feature levels, each RoI is mapped to a level
15 |     according to its scale.
16 | 
17 |     Args:
18 |         roi_layer (dict): Specify RoI layer type and arguments.
19 |         out_channels (int): Output channels of RoI layers.
20 |         featmap_strides (list[int]): Strides of input feature maps.
21 |         finest_scale (int): Scale threshold for mapping to level 0.
22 |     """
23 | 
24 |     def __init__(self,
25 |                  roi_layer,
26 |                  out_channels,
27 |                  featmap_strides,
28 |                  finest_scale=56):
29 |         super(SingleRoIExtractor, self).__init__()
30 |         self.roi_layers = self.build_roi_layers(roi_layer, featmap_strides)
31 |         self.out_channels = out_channels
32 |         self.featmap_strides = featmap_strides
33 |         self.finest_scale = finest_scale
34 | 
35 |     @property
36 |     def num_inputs(self):
37 |         """int: Input feature map levels."""
38 |         return len(self.featmap_strides)
39 | 
40 |     def init_weights(self):
41 |         pass
42 | 
43 |     def build_roi_layers(self, layer_cfg, featmap_strides):
44 |         cfg = layer_cfg.copy()
45 |         layer_type = cfg.pop('type')
46 |         assert hasattr(ops, layer_type)
47 |         layer_cls = getattr(ops, layer_type)
48 |         roi_layers = nn.ModuleList(
49 |             [layer_cls(spatial_scale=1 / s, **cfg) for s in featmap_strides])
50 |         return roi_layers
51 | 
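# For concreteness, a typical roi_layer cfg consumed by build_roi_layers above,
# with values that mirror this repo's configs (treat them as illustrative):
#
#   roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2)
#   featmap_strides=[4, 8, 16, 32]  # one RoIAlign per level, spatial_scale = 1/stride
#
52 |     def map_roi_levels(self, rois, num_levels):
53 |         """Map rois to corresponding feature levels by scales.
54 | 
55 |         - scale < finest_scale * 2: level 0
56 |         - finest_scale * 2 <= scale < finest_scale * 4: level 1
57 |         - finest_scale * 4 <= scale < finest_scale * 8: level 2
58 |         - scale >= finest_scale * 8: level 3
59 | 
60 |         Args:
61 |             rois (Tensor): Input RoIs, shape (k, 5).
62 |             num_levels (int): Total level number.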
63 | 64 | Returns: 65 | Tensor: Level index (0-based) of each RoI, shape (k, ) 66 | """ 67 | scale = torch.sqrt( 68 | (rois[:, 3] - rois[:, 1] + 1) * (rois[:, 4] - rois[:, 2] + 1)) 69 | target_lvls = torch.floor(torch.log2(scale / self.finest_scale + 1e-6)) 70 | target_lvls = target_lvls.clamp(min=0, max=num_levels - 1).long() 71 | return target_lvls 72 | 73 | def forward(self, feats, rois): 74 | if len(feats) == 1: 75 | return self.roi_layers[0](feats[0], rois) 76 | 77 | out_size = self.roi_layers[0].out_size 78 | num_levels = len(feats) 79 | target_lvls = self.map_roi_levels(rois, num_levels) 80 | roi_feats = torch.cuda.FloatTensor(rois.size()[0], self.out_channels, 81 | out_size, out_size).fill_(0) 82 | for i in range(num_levels): 83 | inds = target_lvls == i 84 | if inds.any(): 85 | rois_ = rois[inds, :] 86 | roi_feats_t = self.roi_layers[i](feats[i], rois_) 87 | roi_feats[inds] += roi_feats_t 88 | return roi_feats 89 | -------------------------------------------------------------------------------- /mmdet/core/anchor/anchor_generator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class AnchorGenerator(object): 5 | 6 | def __init__(self, base_size, scales, ratios, scale_major=True, ctr=None): 7 | self.base_size = base_size 8 | self.scales = torch.Tensor(scales) 9 | self.ratios = torch.Tensor(ratios) 10 | self.scale_major = scale_major 11 | self.ctr = ctr 12 | self.base_anchors = self.gen_base_anchors() 13 | 14 | @property 15 | def num_base_anchors(self): 16 | return self.base_anchors.size(0) 17 | 18 | def gen_base_anchors(self): 19 | w = self.base_size 20 | h = self.base_size 21 | if self.ctr is None: 22 | x_ctr = 0.5 * (w - 1) 23 | y_ctr = 0.5 * (h - 1) 24 | else: 25 | x_ctr, y_ctr = self.ctr 26 | 27 | h_ratios = torch.sqrt(self.ratios) 28 | w_ratios = 1 / h_ratios 29 | if self.scale_major: 30 | ws = (w * w_ratios[:, None] * self.scales[None, :]).view(-1) 31 | hs = (h * h_ratios[:, None] * self.scales[None, :]).view(-1) 32 | else: 33 | ws = (w * self.scales[:, None] * w_ratios[None, :]).view(-1) 34 | hs = (h * self.scales[:, None] * h_ratios[None, :]).view(-1) 35 | 36 | base_anchors = torch.stack( 37 | [ 38 | x_ctr - 0.5 * (ws - 1), y_ctr - 0.5 * (hs - 1), 39 | x_ctr + 0.5 * (ws - 1), y_ctr + 0.5 * (hs - 1) 40 | ], 41 | dim=-1).round() 42 | 43 | return base_anchors 44 | 45 | def _meshgrid(self, x, y, row_major=True): 46 | xx = x.repeat(len(y)) 47 | yy = y.view(-1, 1).repeat(1, len(x)).view(-1) 48 | if row_major: 49 | return xx, yy 50 | else: 51 | return yy, xx 52 | 53 | def grid_anchors(self, featmap_size, stride=16, device='cuda'): 54 | base_anchors = self.base_anchors.to(device) 55 | 56 | feat_h, feat_w = featmap_size 57 | shift_x = torch.arange(0, feat_w, device=device) * stride 58 | shift_y = torch.arange(0, feat_h, device=device) * stride 59 | shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) 60 | shifts = torch.stack([shift_xx, shift_yy, shift_xx, shift_yy], dim=-1) 61 | shifts = shifts.type_as(base_anchors) 62 | # first feat_w elements correspond to the first row of shifts 63 | # add A anchors (1, A, 4) to K shifts (K, 1, 4) to get 64 | # shifted anchors (K, A, 4), reshape to (K*A, 4) 65 | 66 | all_anchors = base_anchors[None, :, :] + shifts[:, None, :] 67 | all_anchors = all_anchors.view(-1, 4) 68 | # first A rows correspond to A anchors of (0, 0) in feature map, 69 | # then (0, 1), (0, 2), ... 
70 | return all_anchors 71 | 72 | def valid_flags(self, featmap_size, valid_size, device='cuda'): 73 | feat_h, feat_w = featmap_size 74 | valid_h, valid_w = valid_size 75 | assert valid_h <= feat_h and valid_w <= feat_w 76 | valid_x = torch.zeros(feat_w, dtype=torch.uint8, device=device) 77 | valid_y = torch.zeros(feat_h, dtype=torch.uint8, device=device) 78 | valid_x[:valid_w] = 1 79 | valid_y[:valid_h] = 1 80 | valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) 81 | valid = valid_xx & valid_yy 82 | valid = valid[:, None].expand( 83 | valid.size(0), self.num_base_anchors).contiguous().view(-1) 84 | return valid 85 | -------------------------------------------------------------------------------- /mmdet/models/detectors/rpn.py: -------------------------------------------------------------------------------- 1 | import mmcv 2 | 3 | from mmdet.core import tensor2imgs, bbox_mapping 4 | from .base import BaseDetector 5 | from .test_mixins import RPNTestMixin 6 | from .. import builder 7 | from ..registry import DETECTORS 8 | 9 | 10 | @DETECTORS.register_module 11 | class RPN(BaseDetector, RPNTestMixin): 12 | 13 | def __init__(self, 14 | backbone, 15 | neck, 16 | rpn_head, 17 | train_cfg, 18 | test_cfg, 19 | pretrained=None): 20 | super(RPN, self).__init__() 21 | self.backbone = builder.build_backbone(backbone) 22 | self.neck = builder.build_neck(neck) if neck is not None else None 23 | self.rpn_head = builder.build_head(rpn_head) 24 | self.train_cfg = train_cfg 25 | self.test_cfg = test_cfg 26 | self.init_weights(pretrained=pretrained) 27 | 28 | def init_weights(self, pretrained=None): 29 | super(RPN, self).init_weights(pretrained) 30 | self.backbone.init_weights(pretrained=pretrained) 31 | if self.with_neck: 32 | self.neck.init_weights() 33 | self.rpn_head.init_weights() 34 | 35 | def extract_feat(self, img): 36 | x = self.backbone(img) 37 | if self.with_neck: 38 | x = self.neck(x) 39 | return x 40 | 41 | def forward_train(self, img, img_meta, gt_bboxes=None): 42 | if self.train_cfg.rpn.get('debug', False): 43 | self.rpn_head.debug_imgs = tensor2imgs(img) 44 | 45 | x = self.extract_feat(img) 46 | rpn_outs = self.rpn_head(x) 47 | 48 | rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, self.train_cfg.rpn) 49 | losses = self.rpn_head.loss(*rpn_loss_inputs) 50 | return losses 51 | 52 | def simple_test(self, img, img_meta, rescale=False): 53 | x = self.extract_feat(img) 54 | proposal_list = self.simple_test_rpn(x, img_meta, self.test_cfg.rpn) 55 | if rescale: 56 | for proposals, meta in zip(proposal_list, img_meta): 57 | proposals[:, :4] /= meta['scale_factor'] 58 | # TODO: remove this restriction 59 | return proposal_list[0].cpu().numpy() 60 | 61 | def aug_test(self, imgs, img_metas, rescale=False): 62 | proposal_list = self.aug_test_rpn( 63 | self.extract_feats(imgs), img_metas, self.test_cfg.rpn) 64 | if not rescale: 65 | for proposals, img_meta in zip(proposal_list, img_metas[0]): 66 | img_shape = img_meta['img_shape'] 67 | scale_factor = img_meta['scale_factor'] 68 | flip = img_meta['flip'] 69 | proposals[:, :4] = bbox_mapping(proposals[:, :4], img_shape, 70 | scale_factor, flip) 71 | # TODO: remove this restriction 72 | return proposal_list[0].cpu().numpy() 73 | 74 | def show_result(self, data, result, img_norm_cfg): 75 | """Show RPN proposals on the image. 76 | 77 | Although we assume batch size is 1, this method supports arbitrary 78 | batch size. 
79 | """ 80 | img_tensor = data['img'][0] 81 | img_metas = data['img_meta'][0].data[0] 82 | imgs = tensor2imgs(img_tensor, **img_norm_cfg) 83 | assert len(imgs) == len(img_metas) 84 | for img, img_meta in zip(imgs, img_metas): 85 | h, w, _ = img_meta['img_shape'] 86 | img_show = img[:h, :w, :] 87 | mmcv.imshow_bboxes(img_show, result, top_k=20) 88 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import time 4 | from setuptools import find_packages, setup 5 | 6 | 7 | def readme(): 8 | with open('README.md', encoding='utf-8') as f: 9 | content = f.read() 10 | return content 11 | 12 | 13 | MAJOR = 0 14 | MINOR = 5 15 | PATCH = 6 16 | SUFFIX = '' 17 | SHORT_VERSION = '{}.{}.{}{}'.format(MAJOR, MINOR, PATCH, SUFFIX) 18 | 19 | version_file = 'mmdet/version.py' 20 | 21 | 22 | def get_git_hash(): 23 | 24 | def _minimal_ext_cmd(cmd): 25 | # construct minimal environment 26 | env = {} 27 | for k in ['SYSTEMROOT', 'PATH', 'HOME']: 28 | v = os.environ.get(k) 29 | if v is not None: 30 | env[k] = v 31 | # LANGUAGE is used on win32 32 | env['LANGUAGE'] = 'C' 33 | env['LANG'] = 'C' 34 | env['LC_ALL'] = 'C' 35 | out = subprocess.Popen( 36 | cmd, stdout=subprocess.PIPE, env=env).communicate()[0] 37 | return out 38 | 39 | try: 40 | out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD']) 41 | sha = out.strip().decode('ascii') 42 | except OSError: 43 | sha = 'unknown' 44 | 45 | return sha 46 | 47 | 48 | def get_hash(): 49 | if os.path.exists('.git'): 50 | sha = get_git_hash()[:7] 51 | elif os.path.exists(version_file): 52 | try: 53 | from mmdet.version import __version__ 54 | sha = __version__.split('+')[-1] 55 | except ImportError: 56 | raise ImportError('Unable to get git version') 57 | else: 58 | sha = 'unknown' 59 | 60 | return sha 61 | 62 | 63 | def write_version_py(): 64 | content = """# GENERATED VERSION FILE 65 | # TIME: {} 66 | 67 | __version__ = '{}' 68 | short_version = '{}' 69 | """ 70 | sha = get_hash() 71 | VERSION = SHORT_VERSION + '+' + sha 72 | 73 | with open(version_file, 'w') as f: 74 | f.write(content.format(time.asctime(), VERSION, SHORT_VERSION)) 75 | 76 | 77 | def get_version(): 78 | with open(version_file, 'r') as f: 79 | exec(compile(f.read(), version_file, 'exec')) 80 | return locals()['__version__'] 81 | 82 | 83 | if __name__ == '__main__': 84 | write_version_py() 85 | setup( 86 | name='mmdet', 87 | version=get_version(), 88 | description='Open MMLab Detection Toolbox', 89 | long_description=readme(), 90 | keywords='computer vision, object detection', 91 | url='https://github.com/open-mmlab/mmdetection', 92 | packages=find_packages(exclude=('configs', 'tools', 'demo')), 93 | package_data={'mmdet.ops': ['*/*.so']}, 94 | classifiers=[ 95 | 'Development Status :: 4 - Beta', 96 | 'License :: OSI Approved :: Apache Software License', 97 | 'Operating System :: OS Independent', 98 | 'Programming Language :: Python :: 2', 99 | 'Programming Language :: Python :: 2.7', 100 | 'Programming Language :: Python :: 3', 101 | 'Programming Language :: Python :: 3.4', 102 | 'Programming Language :: Python :: 3.5', 103 | 'Programming Language :: Python :: 3.6', 104 | ], 105 | license='GPLv3', 106 | setup_requires=['pytest-runner'], 107 | tests_require=['pytest'], 108 | install_requires=[ 109 | 'mmcv==0.4', 'numpy', 'matplotlib', 'six', 'terminaltables' 110 | ], 111 | zip_safe=False) 112 | 
-------------------------------------------------------------------------------- /mmdet/core/bbox/bbox_target.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .transforms import bbox2delta 4 | from ..utils import multi_apply 5 | 6 | 7 | def bbox_target(pos_bboxes_list, 8 | neg_bboxes_list, 9 | pos_gt_bboxes_list, 10 | pos_gt_labels_list, 11 | pos_gt_ids_list, 12 | cfg, 13 | reg_classes=1, 14 | target_means=[.0, .0, .0, .0], 15 | target_stds=[1.0, 1.0, 1.0, 1.0], 16 | concat=True): 17 | labels, label_weights, bbox_targets, bbox_weights, ids, id_weights = multi_apply( 18 | bbox_target_single, 19 | pos_bboxes_list, 20 | neg_bboxes_list, 21 | pos_gt_bboxes_list, 22 | pos_gt_labels_list, 23 | pos_gt_ids_list, 24 | cfg=cfg, 25 | reg_classes=reg_classes, 26 | target_means=target_means, 27 | target_stds=target_stds) 28 | 29 | if concat: 30 | labels = torch.cat(labels, 0) 31 | ids = torch.cat(ids, 0) 32 | label_weights = torch.cat(label_weights, 0) 33 | bbox_targets = torch.cat(bbox_targets, 0) 34 | bbox_weights = torch.cat(bbox_weights, 0) 35 | return (labels, label_weights, bbox_targets, bbox_weights), (ids, id_weights) 36 | 37 | 38 | def bbox_target_single(pos_bboxes, 39 | neg_bboxes, 40 | pos_gt_bboxes, 41 | pos_gt_labels, 42 | pos_gt_ids, 43 | cfg, 44 | reg_classes=1, 45 | target_means=[.0, .0, .0, .0], 46 | target_stds=[1.0, 1.0, 1.0, 1.0]): 47 | num_pos = pos_bboxes.size(0) 48 | num_neg = neg_bboxes.size(0) 49 | num_samples = num_pos + num_neg 50 | labels = pos_bboxes.new_zeros(num_samples, dtype=torch.long) 51 | ids = pos_bboxes.new_zeros(num_samples, dtype=torch.long) 52 | label_weights = pos_bboxes.new_zeros(num_samples) 53 | id_weights = pos_bboxes.new_zeros(num_samples) 54 | bbox_targets = pos_bboxes.new_zeros(num_samples, 4) 55 | bbox_weights = pos_bboxes.new_zeros(num_samples, 4) 56 | if num_pos > 0: 57 | labels[:num_pos] = pos_gt_labels 58 | ids[:num_pos] = pos_gt_ids 59 | pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight 60 | label_weights[:num_pos] = pos_weight 61 | pos_bbox_targets = bbox2delta(pos_bboxes, pos_gt_bboxes, target_means, 62 | target_stds) 63 | bbox_targets[:num_pos, :] = pos_bbox_targets 64 | bbox_weights[:num_pos, :] = 1 65 | id_weights[:num_pos] = pos_weight 66 | if num_neg > 0: 67 | label_weights[-num_neg:] = 1.0 68 | if reg_classes > 1: 69 | bbox_targets, bbox_weights = expand_target(bbox_targets, bbox_weights, 70 | labels, reg_classes) 71 | 72 | return labels, label_weights, bbox_targets, bbox_weights, ids, id_weights 73 | 74 | 75 | def expand_target(bbox_targets, bbox_weights, labels, num_classes): 76 | bbox_targets_expand = bbox_targets.new_zeros((bbox_targets.size(0), 77 | 4 * num_classes)) 78 | bbox_weights_expand = bbox_weights.new_zeros((bbox_weights.size(0), 79 | 4 * num_classes)) 80 | for i in torch.nonzero(labels > 0).squeeze(-1): 81 | start, end = labels[i] * 4, (labels[i] + 1) * 4 82 | bbox_targets_expand[i, start:end] = bbox_targets[i, :] 83 | bbox_weights_expand[i, start:end] = bbox_weights[i, :] 84 | return bbox_targets_expand, bbox_weights_expand 85 | -------------------------------------------------------------------------------- /mmdet/core/post_processing/merge_augs.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import numpy as np 4 | 5 | from mmdet.ops import nms 6 | from ..bbox import bbox_mapping_back 7 | 8 | 9 | def merge_aug_proposals(aug_proposals, img_metas, rpn_test_cfg): 10 | """Merge augmented 
proposals (multiscale, flip, etc.) 11 | 12 | Args: 13 | aug_proposals (list[Tensor]): proposals from different testing 14 | schemes, shape (n, 5). Note that they are not rescaled to the 15 | original image size. 16 | img_metas (list[dict]): image info including "img_shape", "scale_factor" and "flip". 17 | rpn_test_cfg (dict): rpn test config. 18 | 19 | Returns: 20 | Tensor: shape (n, 5), proposals corresponding to original image scale. 21 | """ 22 | recovered_proposals = [] 23 | for proposals, img_info in zip(aug_proposals, img_metas): 24 | img_shape = img_info['img_shape'] 25 | scale_factor = img_info['scale_factor'] 26 | flip = img_info['flip'] 27 | _proposals = proposals.clone() 28 | _proposals[:, :4] = bbox_mapping_back(_proposals[:, :4], img_shape, 29 | scale_factor, flip) 30 | recovered_proposals.append(_proposals) 31 | aug_proposals = torch.cat(recovered_proposals, dim=0) 32 | merged_proposals, _ = nms(aug_proposals, rpn_test_cfg.nms_thr) 33 | scores = merged_proposals[:, 4] 34 | _, order = scores.sort(0, descending=True) 35 | num = min(rpn_test_cfg.max_num, merged_proposals.shape[0]) 36 | order = order[:num] 37 | merged_proposals = merged_proposals[order, :] 38 | return merged_proposals 39 | 40 | 41 | def merge_aug_bboxes(aug_bboxes, aug_scores, img_metas, rcnn_test_cfg): 42 | """Merge augmented detection bboxes and scores. 43 | 44 | Args: 45 | aug_bboxes (list[Tensor]): shape (n, 4*#class) 46 | aug_scores (list[Tensor] or None): shape (n, #class) 47 | img_metas (list[list[dict]]): image info including "img_shape", "scale_factor" and "flip". 48 | rcnn_test_cfg (dict): rcnn test config. 49 | 50 | Returns: 51 | tuple: (bboxes, scores) 52 | """ 53 | recovered_bboxes = [] 54 | for bboxes, img_info in zip(aug_bboxes, img_metas): 55 | img_shape = img_info[0]['img_shape'] 56 | scale_factor = img_info[0]['scale_factor'] 57 | flip = img_info[0]['flip'] 58 | bboxes = bbox_mapping_back(bboxes, img_shape, scale_factor, flip) 59 | recovered_bboxes.append(bboxes) 60 | bboxes = torch.stack(recovered_bboxes).mean(dim=0) 61 | if aug_scores is None: 62 | return bboxes 63 | else: 64 | scores = torch.stack(aug_scores).mean(dim=0) 65 | return bboxes, scores 66 | 67 | 68 | def merge_aug_scores(aug_scores): 69 | """Merge augmented bbox scores.""" 70 | if isinstance(aug_scores[0], torch.Tensor): 71 | return torch.mean(torch.stack(aug_scores), dim=0) 72 | else: 73 | return np.mean(aug_scores, axis=0) 74 | 75 | 76 | def merge_aug_masks(aug_masks, img_metas, rcnn_test_cfg, weights=None): 77 | """Merge augmented mask prediction. 78 | 79 | Args: 80 | aug_masks (list[ndarray]): shape (n, #class, h, w) 81 | img_metas (list[list[dict]]): image info including "flip". 82 | rcnn_test_cfg (dict): rcnn test config. 
83 | 84 | Returns: 85 | tuple: (bboxes, scores) 86 | """ 87 | recovered_masks = [ 88 | mask if not img_info[0]['flip'] else mask[..., ::-1] 89 | for mask, img_info in zip(aug_masks, img_metas) 90 | ] 91 | if weights is None: 92 | merged_masks = np.mean(recovered_masks, axis=0) 93 | else: 94 | merged_masks = np.average( 95 | np.array(recovered_masks), axis=0, weights=np.array(weights)) 96 | return merged_masks 97 | -------------------------------------------------------------------------------- /configs/retinanet_r50_fpn_1x.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RetinaNet', 4 | pretrained='modelzoo://resnet50', 5 | backbone=dict( 6 | type='ResNet', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=(0, 1, 2, 3), 10 | frozen_stages=1, 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[256, 512, 1024, 2048], 15 | out_channels=256, 16 | start_level=1, 17 | add_extra_convs=True, 18 | num_outs=5), 19 | bbox_head=dict( 20 | type='RetinaHead', 21 | num_classes=81, 22 | in_channels=256, 23 | stacked_convs=4, 24 | feat_channels=256, 25 | octave_base_scale=4, 26 | scales_per_octave=3, 27 | anchor_ratios=[0.5, 1.0, 2.0], 28 | anchor_strides=[8, 16, 32, 64, 128], 29 | target_means=[.0, .0, .0, .0], 30 | target_stds=[1.0, 1.0, 1.0, 1.0])) 31 | # training and testing settings 32 | train_cfg = dict( 33 | assigner=dict( 34 | type='MaxIoUAssigner', 35 | pos_iou_thr=0.5, 36 | neg_iou_thr=0.4, 37 | min_pos_iou=0, 38 | ignore_iof_thr=-1), 39 | smoothl1_beta=0.11, 40 | gamma=2.0, 41 | alpha=0.25, 42 | allowed_border=-1, 43 | pos_weight=-1, 44 | debug=False) 45 | test_cfg = dict( 46 | nms_pre=1000, 47 | min_bbox_size=0, 48 | score_thr=0.05, 49 | nms=dict(type='nms', iou_thr=0.5), 50 | max_per_img=100) 51 | # dataset settings 52 | dataset_type = 'CocoDataset' 53 | data_root = 'data/coco/' 54 | img_norm_cfg = dict( 55 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 56 | data = dict( 57 | imgs_per_gpu=2, 58 | workers_per_gpu=2, 59 | train=dict( 60 | type=dataset_type, 61 | ann_file=data_root + 'annotations/instances_train2017.json', 62 | img_prefix=data_root + 'train2017/', 63 | img_scale=(1333, 800), 64 | img_norm_cfg=img_norm_cfg, 65 | size_divisor=32, 66 | flip_ratio=0.5, 67 | with_mask=False, 68 | with_crowd=False, 69 | with_label=True), 70 | val=dict( 71 | type=dataset_type, 72 | ann_file=data_root + 'annotations/instances_val2017.json', 73 | img_prefix=data_root + 'val2017/', 74 | img_scale=(1333, 800), 75 | img_norm_cfg=img_norm_cfg, 76 | size_divisor=32, 77 | flip_ratio=0, 78 | with_mask=False, 79 | with_crowd=False, 80 | with_label=True), 81 | test=dict( 82 | type=dataset_type, 83 | ann_file=data_root + 'annotations/instances_val2017.json', 84 | img_prefix=data_root + 'val2017/', 85 | img_scale=(1333, 800), 86 | img_norm_cfg=img_norm_cfg, 87 | size_divisor=32, 88 | flip_ratio=0, 89 | with_mask=False, 90 | with_crowd=False, 91 | with_label=False, 92 | test_mode=True)) 93 | # optimizer 94 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) 95 | optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) 96 | # learning policy 97 | lr_config = dict( 98 | policy='step', 99 | warmup='linear', 100 | warmup_iters=500, 101 | warmup_ratio=1.0 / 3, 102 | step=[8, 11]) 103 | checkpoint_config = dict(interval=1) 104 | # yapf:disable 105 | log_config = dict( 106 | interval=50, 107 | hooks=[ 108 | dict(type='TextLoggerHook'), 109 | # 
dict(type='TensorboardLoggerHook') 110 | ]) 111 | # yapf:enable 112 | # runtime settings 113 | total_epochs = 12 114 | device_ids = range(8) 115 | dist_params = dict(backend='nccl') 116 | log_level = 'INFO' 117 | work_dir = './work_dirs/retinanet_r50_fpn_1x' 118 | load_from = None 119 | resume_from = None 120 | workflow = [('train', 1)] 121 | -------------------------------------------------------------------------------- /configs/retinanet_r101_fpn_1x.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RetinaNet', 4 | pretrained='modelzoo://resnet101', 5 | backbone=dict( 6 | type='ResNet', 7 | depth=101, 8 | num_stages=4, 9 | out_indices=(0, 1, 2, 3), 10 | frozen_stages=1, 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[256, 512, 1024, 2048], 15 | out_channels=256, 16 | start_level=1, 17 | add_extra_convs=True, 18 | num_outs=5), 19 | bbox_head=dict( 20 | type='RetinaHead', 21 | num_classes=81, 22 | in_channels=256, 23 | stacked_convs=4, 24 | feat_channels=256, 25 | octave_base_scale=4, 26 | scales_per_octave=3, 27 | anchor_ratios=[0.5, 1.0, 2.0], 28 | anchor_strides=[8, 16, 32, 64, 128], 29 | target_means=[.0, .0, .0, .0], 30 | target_stds=[1.0, 1.0, 1.0, 1.0])) 31 | # training and testing settings 32 | train_cfg = dict( 33 | assigner=dict( 34 | type='MaxIoUAssigner', 35 | pos_iou_thr=0.5, 36 | neg_iou_thr=0.4, 37 | min_pos_iou=0, 38 | ignore_iof_thr=-1), 39 | smoothl1_beta=0.11, 40 | gamma=2.0, 41 | alpha=0.25, 42 | allowed_border=-1, 43 | pos_weight=-1, 44 | debug=False) 45 | test_cfg = dict( 46 | nms_pre=1000, 47 | min_bbox_size=0, 48 | score_thr=0.05, 49 | nms=dict(type='nms', iou_thr=0.5), 50 | max_per_img=100) 51 | # dataset settings 52 | dataset_type = 'CocoDataset' 53 | data_root = 'data/coco/' 54 | img_norm_cfg = dict( 55 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 56 | data = dict( 57 | imgs_per_gpu=2, 58 | workers_per_gpu=2, 59 | train=dict( 60 | type=dataset_type, 61 | ann_file=data_root + 'annotations/instances_train2017.json', 62 | img_prefix=data_root + 'train2017/', 63 | img_scale=(1333, 800), 64 | img_norm_cfg=img_norm_cfg, 65 | size_divisor=32, 66 | flip_ratio=0.5, 67 | with_mask=False, 68 | with_crowd=False, 69 | with_label=True), 70 | val=dict( 71 | type=dataset_type, 72 | ann_file=data_root + 'annotations/instances_val2017.json', 73 | img_prefix=data_root + 'val2017/', 74 | img_scale=(1333, 800), 75 | img_norm_cfg=img_norm_cfg, 76 | size_divisor=32, 77 | flip_ratio=0, 78 | with_mask=False, 79 | with_crowd=False, 80 | with_label=True), 81 | test=dict( 82 | type=dataset_type, 83 | ann_file=data_root + 'annotations/instances_val2017.json', 84 | img_prefix=data_root + 'val2017/', 85 | img_scale=(1333, 800), 86 | img_norm_cfg=img_norm_cfg, 87 | size_divisor=32, 88 | flip_ratio=0, 89 | with_mask=False, 90 | with_crowd=False, 91 | with_label=False, 92 | test_mode=True)) 93 | # optimizer 94 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) 95 | optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) 96 | # learning policy 97 | lr_config = dict( 98 | policy='step', 99 | warmup='linear', 100 | warmup_iters=500, 101 | warmup_ratio=1.0 / 3, 102 | step=[8, 11]) 103 | checkpoint_config = dict(interval=1) 104 | # yapf:disable 105 | log_config = dict( 106 | interval=50, 107 | hooks=[ 108 | dict(type='TextLoggerHook'), 109 | # dict(type='TensorboardLoggerHook') 110 | ]) 111 | # yapf:enable 112 | # runtime 
settings 113 | total_epochs = 12 114 | device_ids = range(8) 115 | dist_params = dict(backend='nccl') 116 | log_level = 'INFO' 117 | work_dir = './work_dirs/retinanet_r101_fpn_1x' 118 | load_from = None 119 | resume_from = None 120 | workflow = [('train', 1)] 121 | -------------------------------------------------------------------------------- /configs/retinanet_x101_32x4d_fpn_1x.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RetinaNet', 4 | pretrained='open-mmlab://resnext101_32x4d', 5 | backbone=dict( 6 | type='ResNeXt', 7 | depth=101, 8 | groups=32, 9 | base_width=4, 10 | num_stages=4, 11 | out_indices=(0, 1, 2, 3), 12 | frozen_stages=1, 13 | style='pytorch'), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[256, 512, 1024, 2048], 17 | out_channels=256, 18 | start_level=1, 19 | add_extra_convs=True, 20 | num_outs=5), 21 | bbox_head=dict( 22 | type='RetinaHead', 23 | num_classes=81, 24 | in_channels=256, 25 | stacked_convs=4, 26 | feat_channels=256, 27 | octave_base_scale=4, 28 | scales_per_octave=3, 29 | anchor_ratios=[0.5, 1.0, 2.0], 30 | anchor_strides=[8, 16, 32, 64, 128], 31 | target_means=[.0, .0, .0, .0], 32 | target_stds=[1.0, 1.0, 1.0, 1.0])) 33 | # training and testing settings 34 | train_cfg = dict( 35 | assigner=dict( 36 | type='MaxIoUAssigner', 37 | pos_iou_thr=0.5, 38 | neg_iou_thr=0.4, 39 | min_pos_iou=0, 40 | ignore_iof_thr=-1), 41 | smoothl1_beta=0.11, 42 | gamma=2.0, 43 | alpha=0.25, 44 | allowed_border=-1, 45 | pos_weight=-1, 46 | debug=False) 47 | test_cfg = dict( 48 | nms_pre=1000, 49 | min_bbox_size=0, 50 | score_thr=0.05, 51 | nms=dict(type='nms', iou_thr=0.5), 52 | max_per_img=100) 53 | # dataset settings 54 | dataset_type = 'CocoDataset' 55 | data_root = 'data/coco/' 56 | img_norm_cfg = dict( 57 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 58 | data = dict( 59 | imgs_per_gpu=2, 60 | workers_per_gpu=2, 61 | train=dict( 62 | type=dataset_type, 63 | ann_file=data_root + 'annotations/instances_train2017.json', 64 | img_prefix=data_root + 'train2017/', 65 | img_scale=(1333, 800), 66 | img_norm_cfg=img_norm_cfg, 67 | size_divisor=32, 68 | flip_ratio=0.5, 69 | with_mask=False, 70 | with_crowd=False, 71 | with_label=True), 72 | val=dict( 73 | type=dataset_type, 74 | ann_file=data_root + 'annotations/instances_val2017.json', 75 | img_prefix=data_root + 'val2017/', 76 | img_scale=(1333, 800), 77 | img_norm_cfg=img_norm_cfg, 78 | size_divisor=32, 79 | flip_ratio=0, 80 | with_mask=False, 81 | with_crowd=False, 82 | with_label=True), 83 | test=dict( 84 | type=dataset_type, 85 | ann_file=data_root + 'annotations/instances_val2017.json', 86 | img_prefix=data_root + 'val2017/', 87 | img_scale=(1333, 800), 88 | img_norm_cfg=img_norm_cfg, 89 | size_divisor=32, 90 | flip_ratio=0, 91 | with_mask=False, 92 | with_crowd=False, 93 | with_label=False, 94 | test_mode=True)) 95 | # optimizer 96 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) 97 | optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) 98 | # learning policy 99 | lr_config = dict( 100 | policy='step', 101 | warmup='linear', 102 | warmup_iters=500, 103 | warmup_ratio=1.0 / 3, 104 | step=[8, 11]) 105 | checkpoint_config = dict(interval=1) 106 | # yapf:disable 107 | log_config = dict( 108 | interval=50, 109 | hooks=[ 110 | dict(type='TextLoggerHook'), 111 | # dict(type='TensorboardLoggerHook') 112 | ]) 113 | # yapf:enable 114 | # runtime settings 115 | total_epochs = 12 
116 | device_ids = range(8) 117 | dist_params = dict(backend='nccl') 118 | log_level = 'INFO' 119 | work_dir = './work_dirs/retinanet_r50_fpn_1x' 120 | load_from = None 121 | resume_from = None 122 | workflow = [('train', 1)] 123 | -------------------------------------------------------------------------------- /configs/retinanet_x101_64x4d_fpn_1x.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RetinaNet', 4 | pretrained='open-mmlab://resnext101_64x4d', 5 | backbone=dict( 6 | type='ResNeXt', 7 | depth=101, 8 | groups=64, 9 | base_width=4, 10 | num_stages=4, 11 | out_indices=(0, 1, 2, 3), 12 | frozen_stages=1, 13 | style='pytorch'), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[256, 512, 1024, 2048], 17 | out_channels=256, 18 | start_level=1, 19 | add_extra_convs=True, 20 | num_outs=5), 21 | bbox_head=dict( 22 | type='RetinaHead', 23 | num_classes=81, 24 | in_channels=256, 25 | stacked_convs=4, 26 | feat_channels=256, 27 | octave_base_scale=4, 28 | scales_per_octave=3, 29 | anchor_ratios=[0.5, 1.0, 2.0], 30 | anchor_strides=[8, 16, 32, 64, 128], 31 | target_means=[.0, .0, .0, .0], 32 | target_stds=[1.0, 1.0, 1.0, 1.0])) 33 | # training and testing settings 34 | train_cfg = dict( 35 | assigner=dict( 36 | type='MaxIoUAssigner', 37 | pos_iou_thr=0.5, 38 | neg_iou_thr=0.4, 39 | min_pos_iou=0, 40 | ignore_iof_thr=-1), 41 | smoothl1_beta=0.11, 42 | gamma=2.0, 43 | alpha=0.25, 44 | allowed_border=-1, 45 | pos_weight=-1, 46 | debug=False) 47 | test_cfg = dict( 48 | nms_pre=1000, 49 | min_bbox_size=0, 50 | score_thr=0.05, 51 | nms=dict(type='nms', iou_thr=0.5), 52 | max_per_img=100) 53 | # dataset settings 54 | dataset_type = 'CocoDataset' 55 | data_root = 'data/coco/' 56 | img_norm_cfg = dict( 57 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 58 | data = dict( 59 | imgs_per_gpu=2, 60 | workers_per_gpu=2, 61 | train=dict( 62 | type=dataset_type, 63 | ann_file=data_root + 'annotations/instances_train2017.json', 64 | img_prefix=data_root + 'train2017/', 65 | img_scale=(1333, 800), 66 | img_norm_cfg=img_norm_cfg, 67 | size_divisor=32, 68 | flip_ratio=0.5, 69 | with_mask=False, 70 | with_crowd=False, 71 | with_label=True), 72 | val=dict( 73 | type=dataset_type, 74 | ann_file=data_root + 'annotations/instances_val2017.json', 75 | img_prefix=data_root + 'val2017/', 76 | img_scale=(1333, 800), 77 | img_norm_cfg=img_norm_cfg, 78 | size_divisor=32, 79 | flip_ratio=0, 80 | with_mask=False, 81 | with_crowd=False, 82 | with_label=True), 83 | test=dict( 84 | type=dataset_type, 85 | ann_file=data_root + 'annotations/instances_val2017.json', 86 | img_prefix=data_root + 'val2017/', 87 | img_scale=(1333, 800), 88 | img_norm_cfg=img_norm_cfg, 89 | size_divisor=32, 90 | flip_ratio=0, 91 | with_mask=False, 92 | with_crowd=False, 93 | with_label=False, 94 | test_mode=True)) 95 | # optimizer 96 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) 97 | optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) 98 | # learning policy 99 | lr_config = dict( 100 | policy='step', 101 | warmup='linear', 102 | warmup_iters=500, 103 | warmup_ratio=1.0 / 3, 104 | step=[8, 11]) 105 | checkpoint_config = dict(interval=1) 106 | # yapf:disable 107 | log_config = dict( 108 | interval=50, 109 | hooks=[ 110 | dict(type='TextLoggerHook'), 111 | # dict(type='TensorboardLoggerHook') 112 | ]) 113 | # yapf:enable 114 | # runtime settings 115 | total_epochs = 12 116 | device_ids = range(8) 117 | 
dist_params = dict(backend='nccl') 118 | log_level = 'INFO' 119 | work_dir = './work_dirs/retinanet_r50_fpn_1x' 120 | load_from = None 121 | resume_from = None 122 | workflow = [('train', 1)] 123 | -------------------------------------------------------------------------------- /configs/rpn_r50_fpn_1x.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RPN', 4 | pretrained='modelzoo://resnet50', 5 | backbone=dict( 6 | type='ResNet', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=(0, 1, 2, 3), 10 | frozen_stages=1, 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[256, 512, 1024, 2048], 15 | out_channels=256, 16 | num_outs=5), 17 | rpn_head=dict( 18 | type='RPNHead', 19 | in_channels=256, 20 | feat_channels=256, 21 | anchor_scales=[8], 22 | anchor_ratios=[0.5, 1.0, 2.0], 23 | anchor_strides=[4, 8, 16, 32, 64], 24 | target_means=[.0, .0, .0, .0], 25 | target_stds=[1.0, 1.0, 1.0, 1.0], 26 | use_sigmoid_cls=True)) 27 | # model training and testing settings 28 | train_cfg = dict( 29 | rpn=dict( 30 | assigner=dict( 31 | type='MaxIoUAssigner', 32 | pos_iou_thr=0.7, 33 | neg_iou_thr=0.3, 34 | min_pos_iou=0.3, 35 | ignore_iof_thr=-1), 36 | sampler=dict( 37 | type='RandomSampler', 38 | num=256, 39 | pos_fraction=0.5, 40 | neg_pos_ub=-1, 41 | add_gt_as_proposals=False), 42 | allowed_border=0, 43 | pos_weight=-1, 44 | smoothl1_beta=1 / 9.0, 45 | debug=False)) 46 | test_cfg = dict( 47 | rpn=dict( 48 | nms_across_levels=False, 49 | nms_pre=2000, 50 | nms_post=2000, 51 | max_num=2000, 52 | nms_thr=0.7, 53 | min_bbox_size=0)) 54 | # dataset settings 55 | dataset_type = 'CocoDataset' 56 | data_root = 'data/coco/' 57 | img_norm_cfg = dict( 58 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 59 | data = dict( 60 | imgs_per_gpu=2, 61 | workers_per_gpu=2, 62 | train=dict( 63 | type=dataset_type, 64 | ann_file=data_root + 'annotations/instances_train2017.json', 65 | img_prefix=data_root + 'train2017/', 66 | img_scale=(1333, 800), 67 | img_norm_cfg=img_norm_cfg, 68 | size_divisor=32, 69 | flip_ratio=0.5, 70 | with_mask=False, 71 | with_crowd=False, 72 | with_label=False), 73 | val=dict( 74 | type=dataset_type, 75 | ann_file=data_root + 'annotations/instances_val2017.json', 76 | img_prefix=data_root + 'val2017/', 77 | img_scale=(1333, 800), 78 | img_norm_cfg=img_norm_cfg, 79 | size_divisor=32, 80 | flip_ratio=0, 81 | with_mask=False, 82 | with_crowd=False, 83 | with_label=False), 84 | test=dict( 85 | type=dataset_type, 86 | ann_file=data_root + 'annotations/instances_val2017.json', 87 | img_prefix=data_root + 'val2017/', 88 | img_scale=(1333, 800), 89 | img_norm_cfg=img_norm_cfg, 90 | size_divisor=32, 91 | flip_ratio=0, 92 | with_mask=False, 93 | with_label=False, 94 | test_mode=True)) 95 | # optimizer 96 | optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) 97 | # runner configs 98 | optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) 99 | lr_config = dict( 100 | policy='step', 101 | warmup='linear', 102 | warmup_iters=500, 103 | warmup_ratio=1.0 / 3, 104 | step=[8, 11]) 105 | checkpoint_config = dict(interval=1) 106 | # yapf:disable 107 | log_config = dict( 108 | interval=50, 109 | hooks=[ 110 | dict(type='TextLoggerHook'), 111 | # dict(type='TensorboardLoggerHook') 112 | ]) 113 | # yapf:enable 114 | # runtime settings 115 | total_epochs = 12 116 | dist_params = dict(backend='nccl') 117 | log_level = 'INFO' 118 | work_dir = 
'./work_dirs/rpn_r50_fpn_1x' 119 | load_from = None 120 | resume_from = None 121 | workflow = [('train', 1)] 122 | -------------------------------------------------------------------------------- /configs/rpn_r101_fpn_1x.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RPN', 4 | pretrained='modelzoo://resnet101', 5 | backbone=dict( 6 | type='ResNet', 7 | depth=101, 8 | num_stages=4, 9 | out_indices=(0, 1, 2, 3), 10 | frozen_stages=1, 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[256, 512, 1024, 2048], 15 | out_channels=256, 16 | num_outs=5), 17 | rpn_head=dict( 18 | type='RPNHead', 19 | in_channels=256, 20 | feat_channels=256, 21 | anchor_scales=[8], 22 | anchor_ratios=[0.5, 1.0, 2.0], 23 | anchor_strides=[4, 8, 16, 32, 64], 24 | target_means=[.0, .0, .0, .0], 25 | target_stds=[1.0, 1.0, 1.0, 1.0], 26 | use_sigmoid_cls=True)) 27 | # model training and testing settings 28 | train_cfg = dict( 29 | rpn=dict( 30 | assigner=dict( 31 | type='MaxIoUAssigner', 32 | pos_iou_thr=0.7, 33 | neg_iou_thr=0.3, 34 | min_pos_iou=0.3, 35 | ignore_iof_thr=-1), 36 | sampler=dict( 37 | type='RandomSampler', 38 | num=256, 39 | pos_fraction=0.5, 40 | neg_pos_ub=-1, 41 | add_gt_as_proposals=False), 42 | allowed_border=0, 43 | pos_weight=-1, 44 | smoothl1_beta=1 / 9.0, 45 | debug=False)) 46 | test_cfg = dict( 47 | rpn=dict( 48 | nms_across_levels=False, 49 | nms_pre=2000, 50 | nms_post=2000, 51 | max_num=2000, 52 | nms_thr=0.7, 53 | min_bbox_size=0)) 54 | # dataset settings 55 | dataset_type = 'CocoDataset' 56 | data_root = 'data/coco/' 57 | img_norm_cfg = dict( 58 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 59 | data = dict( 60 | imgs_per_gpu=2, 61 | workers_per_gpu=2, 62 | train=dict( 63 | type=dataset_type, 64 | ann_file=data_root + 'annotations/instances_train2017.json', 65 | img_prefix=data_root + 'train2017/', 66 | img_scale=(1333, 800), 67 | img_norm_cfg=img_norm_cfg, 68 | size_divisor=32, 69 | flip_ratio=0.5, 70 | with_mask=False, 71 | with_crowd=False, 72 | with_label=False), 73 | val=dict( 74 | type=dataset_type, 75 | ann_file=data_root + 'annotations/instances_val2017.json', 76 | img_prefix=data_root + 'val2017/', 77 | img_scale=(1333, 800), 78 | img_norm_cfg=img_norm_cfg, 79 | size_divisor=32, 80 | flip_ratio=0, 81 | with_mask=False, 82 | with_crowd=False, 83 | with_label=False), 84 | test=dict( 85 | type=dataset_type, 86 | ann_file=data_root + 'annotations/instances_val2017.json', 87 | img_prefix=data_root + 'val2017/', 88 | img_scale=(1333, 800), 89 | img_norm_cfg=img_norm_cfg, 90 | size_divisor=32, 91 | flip_ratio=0, 92 | with_mask=False, 93 | with_label=False, 94 | test_mode=True)) 95 | # optimizer 96 | optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) 97 | # runner configs 98 | optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) 99 | lr_config = dict( 100 | policy='step', 101 | warmup='linear', 102 | warmup_iters=500, 103 | warmup_ratio=1.0 / 3, 104 | step=[8, 11]) 105 | checkpoint_config = dict(interval=1) 106 | # yapf:disable 107 | log_config = dict( 108 | interval=50, 109 | hooks=[ 110 | dict(type='TextLoggerHook'), 111 | # dict(type='TensorboardLoggerHook') 112 | ]) 113 | # yapf:enable 114 | # runtime settings 115 | total_epochs = 12 116 | dist_params = dict(backend='nccl') 117 | log_level = 'INFO' 118 | work_dir = './work_dirs/rpn_r101_fpn_1x' 119 | load_from = None 120 | resume_from = None 121 | workflow = 
[('train', 1)] 122 | -------------------------------------------------------------------------------- /configs/rpn_x101_32x4d_fpn_1x.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RPN', 4 | pretrained='open-mmlab://resnext101_32x4d', 5 | backbone=dict( 6 | type='ResNeXt', 7 | depth=101, 8 | groups=32, 9 | base_width=4, 10 | num_stages=4, 11 | out_indices=(0, 1, 2, 3), 12 | frozen_stages=1, 13 | style='pytorch'), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[256, 512, 1024, 2048], 17 | out_channels=256, 18 | num_outs=5), 19 | rpn_head=dict( 20 | type='RPNHead', 21 | in_channels=256, 22 | feat_channels=256, 23 | anchor_scales=[8], 24 | anchor_ratios=[0.5, 1.0, 2.0], 25 | anchor_strides=[4, 8, 16, 32, 64], 26 | target_means=[.0, .0, .0, .0], 27 | target_stds=[1.0, 1.0, 1.0, 1.0], 28 | use_sigmoid_cls=True)) 29 | # model training and testing settings 30 | train_cfg = dict( 31 | rpn=dict( 32 | assigner=dict( 33 | type='MaxIoUAssigner', 34 | pos_iou_thr=0.7, 35 | neg_iou_thr=0.3, 36 | min_pos_iou=0.3, 37 | ignore_iof_thr=-1), 38 | sampler=dict( 39 | type='RandomSampler', 40 | num=256, 41 | pos_fraction=0.5, 42 | neg_pos_ub=-1, 43 | add_gt_as_proposals=False), 44 | allowed_border=0, 45 | pos_weight=-1, 46 | smoothl1_beta=1 / 9.0, 47 | debug=False)) 48 | test_cfg = dict( 49 | rpn=dict( 50 | nms_across_levels=False, 51 | nms_pre=2000, 52 | nms_post=2000, 53 | max_num=2000, 54 | nms_thr=0.7, 55 | min_bbox_size=0)) 56 | # dataset settings 57 | dataset_type = 'CocoDataset' 58 | data_root = 'data/coco/' 59 | img_norm_cfg = dict( 60 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 61 | data = dict( 62 | imgs_per_gpu=2, 63 | workers_per_gpu=2, 64 | train=dict( 65 | type=dataset_type, 66 | ann_file=data_root + 'annotations/instances_train2017.json', 67 | img_prefix=data_root + 'train2017/', 68 | img_scale=(1333, 800), 69 | img_norm_cfg=img_norm_cfg, 70 | size_divisor=32, 71 | flip_ratio=0.5, 72 | with_mask=False, 73 | with_crowd=False, 74 | with_label=False), 75 | val=dict( 76 | type=dataset_type, 77 | ann_file=data_root + 'annotations/instances_val2017.json', 78 | img_prefix=data_root + 'val2017/', 79 | img_scale=(1333, 800), 80 | img_norm_cfg=img_norm_cfg, 81 | size_divisor=32, 82 | flip_ratio=0, 83 | with_mask=False, 84 | with_crowd=False, 85 | with_label=False), 86 | test=dict( 87 | type=dataset_type, 88 | ann_file=data_root + 'annotations/instances_val2017.json', 89 | img_prefix=data_root + 'val2017/', 90 | img_scale=(1333, 800), 91 | img_norm_cfg=img_norm_cfg, 92 | size_divisor=32, 93 | flip_ratio=0, 94 | with_mask=False, 95 | with_label=False, 96 | test_mode=True)) 97 | # optimizer 98 | optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) 99 | # runner configs 100 | optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) 101 | lr_config = dict( 102 | policy='step', 103 | warmup='linear', 104 | warmup_iters=500, 105 | warmup_ratio=1.0 / 3, 106 | step=[8, 11]) 107 | checkpoint_config = dict(interval=1) 108 | # yapf:disable 109 | log_config = dict( 110 | interval=50, 111 | hooks=[ 112 | dict(type='TextLoggerHook'), 113 | # dict(type='TensorboardLoggerHook') 114 | ]) 115 | # yapf:enable 116 | # runtime settings 117 | total_epochs = 12 118 | dist_params = dict(backend='nccl') 119 | log_level = 'INFO' 120 | work_dir = './work_dirs/rpn_r101_fpn_1x' 121 | load_from = None 122 | resume_from = None 123 | workflow = [('train', 1)] 124 | 
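These config files are plain Python modules that mmcv evaluates into a nested dict. As a rough sketch of how one is consumed — mirroring the usual mmdetection entry point, though the exact calls in this repo's `tools/train.py` are not shown in this section, so treat the details as an assumption:

```python
from mmcv import Config

from mmdet.models import build_detector

# Parse the config; top-level variables (model, train_cfg, test_cfg, data,
# optimizer, ...) become attributes of the Config object.
cfg = Config.fromfile('configs/rpn_x101_32x4d_fpn_1x.py')

# build_detector looks up cfg.model['type'] ('RPN') in the DETECTORS
# registry and recursively builds the backbone, neck and rpn_head.
model = build_detector(cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)
```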
-------------------------------------------------------------------------------- /configs/rpn_x101_64x4d_fpn_1x.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RPN', 4 | pretrained='open-mmlab://resnext101_64x4d', 5 | backbone=dict( 6 | type='ResNeXt', 7 | depth=101, 8 | groups=64, 9 | base_width=4, 10 | num_stages=4, 11 | out_indices=(0, 1, 2, 3), 12 | frozen_stages=1, 13 | style='pytorch'), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[256, 512, 1024, 2048], 17 | out_channels=256, 18 | num_outs=5), 19 | rpn_head=dict( 20 | type='RPNHead', 21 | in_channels=256, 22 | feat_channels=256, 23 | anchor_scales=[8], 24 | anchor_ratios=[0.5, 1.0, 2.0], 25 | anchor_strides=[4, 8, 16, 32, 64], 26 | target_means=[.0, .0, .0, .0], 27 | target_stds=[1.0, 1.0, 1.0, 1.0], 28 | use_sigmoid_cls=True)) 29 | # model training and testing settings 30 | train_cfg = dict( 31 | rpn=dict( 32 | assigner=dict( 33 | type='MaxIoUAssigner', 34 | pos_iou_thr=0.7, 35 | neg_iou_thr=0.3, 36 | min_pos_iou=0.3, 37 | ignore_iof_thr=-1), 38 | sampler=dict( 39 | type='RandomSampler', 40 | num=256, 41 | pos_fraction=0.5, 42 | neg_pos_ub=-1, 43 | add_gt_as_proposals=False), 44 | allowed_border=0, 45 | pos_weight=-1, 46 | smoothl1_beta=1 / 9.0, 47 | debug=False)) 48 | test_cfg = dict( 49 | rpn=dict( 50 | nms_across_levels=False, 51 | nms_pre=2000, 52 | nms_post=2000, 53 | max_num=2000, 54 | nms_thr=0.7, 55 | min_bbox_size=0)) 56 | # dataset settings 57 | dataset_type = 'CocoDataset' 58 | data_root = 'data/coco/' 59 | img_norm_cfg = dict( 60 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 61 | data = dict( 62 | imgs_per_gpu=2, 63 | workers_per_gpu=2, 64 | train=dict( 65 | type=dataset_type, 66 | ann_file=data_root + 'annotations/instances_train2017.json', 67 | img_prefix=data_root + 'train2017/', 68 | img_scale=(1333, 800), 69 | img_norm_cfg=img_norm_cfg, 70 | size_divisor=32, 71 | flip_ratio=0.5, 72 | with_mask=False, 73 | with_crowd=False, 74 | with_label=False), 75 | val=dict( 76 | type=dataset_type, 77 | ann_file=data_root + 'annotations/instances_val2017.json', 78 | img_prefix=data_root + 'val2017/', 79 | img_scale=(1333, 800), 80 | img_norm_cfg=img_norm_cfg, 81 | size_divisor=32, 82 | flip_ratio=0, 83 | with_mask=False, 84 | with_crowd=False, 85 | with_label=False), 86 | test=dict( 87 | type=dataset_type, 88 | ann_file=data_root + 'annotations/instances_val2017.json', 89 | img_prefix=data_root + 'val2017/', 90 | img_scale=(1333, 800), 91 | img_norm_cfg=img_norm_cfg, 92 | size_divisor=32, 93 | flip_ratio=0, 94 | with_mask=False, 95 | with_label=False, 96 | test_mode=True)) 97 | # optimizer 98 | optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) 99 | # runner configs 100 | optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) 101 | lr_config = dict( 102 | policy='step', 103 | warmup='linear', 104 | warmup_iters=500, 105 | warmup_ratio=1.0 / 3, 106 | step=[8, 11]) 107 | checkpoint_config = dict(interval=1) 108 | # yapf:disable 109 | log_config = dict( 110 | interval=50, 111 | hooks=[ 112 | dict(type='TextLoggerHook'), 113 | # dict(type='TensorboardLoggerHook') 114 | ]) 115 | # yapf:enable 116 | # runtime settings 117 | total_epochs = 12 118 | dist_params = dict(backend='nccl') 119 | log_level = 'INFO' 120 | work_dir = './work_dirs/rpn_r101_fpn_1x' 121 | load_from = None 122 | resume_from = None 123 | workflow = [('train', 1)] 124 | 
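A note on the anchor settings shared by all of the RPN configs above: `anchor_scales=[8]` combined with `anchor_strides=[4, 8, 16, 32, 64]` yields one base anchor size per FPN level, assuming each stride doubles as the `base_size` passed to the `AnchorGenerator` shown earlier (the anchor head that wires this up is not included in this section, so that default is an assumption here):

```python
anchor_scales = [8]
anchor_strides = [4, 8, 16, 32, 64]

# In gen_base_anchors(), side length = base_size * scale before the
# aspect-ratio adjustment, so with base_size == stride:
base_sizes = [stride * anchor_scales[0] for stride in anchor_strides]
print(base_sizes)  # [32, 64, 128, 256, 512], one square anchor per level
```

`anchor_ratios=[0.5, 1.0, 2.0]` then stretches each square by `sqrt(ratio)` in height and shrinks it by the same factor in width, keeping the area roughly constant, exactly as `gen_base_anchors()` does.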
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MaskTrackRCNN for video instance segmentation 2 | 3 | ## Introduction 4 | This repo serves as the official code release of the MaskTrackRCNN model for video instance segmentation described in the tech report: 5 | ``` 6 | @article{ Yang2019vis, 7 | author = {Linjie Yang and Yuchen Fan and Ning Xu}, 8 | title = {Video instance segmentation}, 9 | journal = {CoRR}, 10 | volume = {abs/1905.04804}, 11 | year = {2019}, 12 | url = {https://arxiv.org/abs/1905.04804} 13 | } 14 | ``` 15 | In this work, a new task, video instance segmentation, is presented. Video instance segmentation extends the image instance segmentation task from the image domain to the video domain. The new problem aims at **simultaneous detection, segmentation and tracking** of object instances in videos. 16 | YouTubeVIS, a new dataset tailored for this task, is collected based on the current largest video object segmentation dataset [YouTubeVOS](https://youtube-vos.org). Sample annotations of a video clip can be seen below. 17 | 18 | We also propose an algorithm, named MaskTrackRCNN, to jointly detect, segment, and track object instances in a video. A tracking head is added to the original MaskRCNN model to match objects across frames. An overview of the algorithm is shown below. 19 | 20 | ## Installation 21 | This repo is built on [mmdetection](https://github.com/open-mmlab/mmdetection) commit hash `f3a939f`. Please refer to [INSTALL.md](INSTALL.md) to install the library. 22 | You also need to install a customized [COCO API](https://github.com/youtubevos/cocoapi) for the YouTubeVIS dataset. 23 | You can use the following commands to create a conda env with all dependencies. 24 | ``` 25 | conda create -n MaskTrackRCNN -y 26 | conda activate MaskTrackRCNN 27 | conda install -c pytorch pytorch=0.4.1 torchvision cuda92 -y 28 | conda install -c conda-forge cudatoolkit-dev=9.2 opencv -y 29 | conda install cython pillow=7 -y 30 | pip install git+https://github.com/youtubevos/cocoapi.git#"egg=pycocotools&subdirectory=PythonAPI" 31 | bash compile.sh 32 | pip install . 33 | ``` 34 | You may also need to follow [#1](/../../issues/1) to load MSCOCO pretrained models. 35 | ## Model training and evaluation 36 | Our model is based on MaskRCNN-resnet50-FPN. The model is trained end-to-end on YouTubeVIS based on an MSCOCO pretrained checkpoint ([link](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_fpn_1x_20181010-069fa190.pth)). 37 | ### Training 38 | 1. Download YouTubeVIS from [here](https://youtube-vos.org/dataset/vis/). 39 | 2. Symlink the train/validation dataset to the `$MMDETECTION/data` folder. Put COCO-style annotations under `$MMDETECTION/data/annotations`. 40 | ``` 41 | mmdetection 42 | ├── mmdet 43 | ├── tools 44 | ├── configs 45 | ├── data 46 | │ ├── train 47 | │ ├── val 48 | │ ├── annotations 49 | │ │ ├── instances_train_sub.json 50 | │ │ ├── instances_val_sub.json 51 | ``` 52 | 53 | 3. Run `python3 tools/train.py configs/masktrack_rcnn_r50_fpn_1x_youtubevos.py` to train the model. 54 | For arguments such as learning rate and model parameters, please refer to `configs/masktrack_rcnn_r50_fpn_1x_youtubevos.py`. 55 | 56 | ### Evaluation 57 | 58 | Our pretrained model is available for download at [Google Drive](https://drive.google.com/file/d/1L4R2vwlXYzW0CU7wbBCNGKVLmog1Sz2R/view?usp=sharing). 
59 | Run the following command to evaluate the model on YouTubeVIS. 60 | ``` 61 | python3 tools/test_video.py configs/masktrack_rcnn_r50_fpn_1x_youtubevos.py [MODEL_PATH] --out [OUTPUT_PATH] --eval segm 62 | ``` 63 | A json file containing the predicted result will be generated as `OUTPUT_PATH.json`. YouTubeVIS currently only allows evaluation on the codalab server. Please upload the generated result to the [codalab server](https://competitions.codalab.org/competitions/20128) to see the actual performance. 64 | 65 | ## License 66 | 67 | This project is released under the [Apache 2.0 license](LICENSE). 68 | ## Contact 69 | If you have any questions regarding the repo, please contact Linjie Yang (yljatthu@gmail.com) or create an issue. 70 | -------------------------------------------------------------------------------- /TECHNICAL_DETAILS.md: -------------------------------------------------------------------------------- 1 | ## Overview 2 | 3 | In this section, we will introduce the main units of training a detector: 4 | data loading, model and iteration pipeline. 5 | 6 | ## Data loading 7 | 8 | Following typical conventions, we use `Dataset` and `DataLoader` for data loading 9 | with multiple workers. `Dataset` returns a dict of data items corresponding to 10 | the arguments of the model's forward method. 11 | Since the data in object detection may not be the same size (image size, gt bbox size, etc.), 12 | we introduce a new `DataContainer` type in `mmcv` to help collect and distribute 13 | data of different sizes. 14 | See [here](https://github.com/open-mmlab/mmcv/blob/master/mmcv/parallel/data_container.py) for more details. 15 | 16 | ## Model 17 | 18 | In mmdetection, model components are basically categorized into 4 types. 19 | 20 | - backbone: usually an FCN network to extract feature maps, e.g., ResNet. 21 | - neck: the part between backbones and heads, e.g., FPN, ASPP. 22 | - head: the part for specific tasks, e.g., bbox prediction and mask prediction. 23 | - roi extractor: the part for extracting features from feature maps, e.g., RoI Align. 24 | 25 | We also implement some general detection pipelines with the above components, 26 | such as `SingleStageDetector` and `TwoStageDetector`. 27 | 28 | ### Build a model with basic components 29 | 30 | Following some basic pipelines (e.g., two-stage detectors), the model structure 31 | can be customized through config files with no pain. 32 | 33 | If we want to implement some new components, e.g., the path aggregation 34 | FPN structure in [Path Aggregation Network for Instance Segmentation](https://arxiv.org/abs/1803.01534), there are two things to do. 35 | 36 | 1. create a new file in `mmdet/models/necks/pafpn.py`. 37 | 38 | ```python 39 | class PAFPN(nn.Module): 40 | 41 | def __init__(self, 42 | in_channels, 43 | out_channels, 44 | num_outs, 45 | start_level=0, 46 | end_level=-1, 47 | add_extra_convs=False): 48 | pass 49 | 50 | def forward(self, inputs): 51 | # implementation is ignored 52 | pass 53 | ``` 54 | 55 | 2. modify the config file from 56 | 57 | ```python 58 | neck=dict( 59 | type='FPN', 60 | in_channels=[256, 512, 1024, 2048], 61 | out_channels=256, 62 | num_outs=5) 63 | ``` 64 | 65 | to 66 | 67 | ```python 68 | neck=dict( 69 | type='PAFPN', 70 | in_channels=[256, 512, 1024, 2048], 71 | out_channels=256, 72 | num_outs=5) 73 | ``` 74 | 75 | We will release more components (backbones, necks, heads) for research purposes. 
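One detail the step-1 snippet above leaves out: for `type='PAFPN'` in the config to resolve, the class must be registered, in the same way components elsewhere in this codebase use `@DETECTORS.register_module` and `@ROI_EXTRACTORS.register_module`. A minimal sketch, assuming the neck registry in `mmdet/models/registry.py` is named `NECKS` (its definition is not shown in this section):

```python
# mmdet/models/necks/pafpn.py
import torch.nn as nn

from ..registry import NECKS


@NECKS.register_module
class PAFPN(nn.Module):

    def __init__(self,
                 in_channels,
                 out_channels,
                 num_outs,
                 start_level=0,
                 end_level=-1,
                 add_extra_convs=False):
        super(PAFPN, self).__init__()
        # the lateral convs, top-down convs and the extra bottom-up
        # path of PAFPN would be built here

    def forward(self, inputs):
        # implementation is ignored
        pass
```

The new module also has to be imported in `mmdet/models/necks/__init__.py`; otherwise the decorator never runs and the registry lookup fails.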
76 | 77 | ### Write a new model 78 | 79 | To write a new detection pipeline, you need to inherit from `BaseDetector`, 80 | which defines the following abstract methods. 81 | 82 | - `extract_feat()`: given an image batch of shape (n, c, h, w), extract the feature map(s). 83 | - `forward_train()`: forward method of the training mode 84 | - `simple_test()`: single scale testing without augmentation 85 | - `aug_test()`: testing with augmentation (multi-scale, flip, etc.) 86 | 87 | [TwoStageDetector](https://github.com/hellock/mmdetection/blob/master/mmdet/models/detectors/two_stage.py) 88 | is a good example which shows how to do that. 89 | 90 | ## Iteration pipeline 91 | 92 | We adopt distributed training for both single machine and multiple machines. 93 | Supposing that the server has 8 GPUs, 8 processes will be started and each process runs on a single GPU. 94 | 95 | Each process keeps an isolated model, data loader, and optimizer. 96 | Model parameters are only synchronized once at the beginning. 97 | After a forward and backward pass, gradients will be allreduced among all GPUs, 98 | and the optimizer will update model parameters. 99 | Since the gradients are allreduced, the model parameters stay the same for all processes after the iteration. 100 | -------------------------------------------------------------------------------- /mmdet/models/anchor_heads/rpn_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from mmcv.cnn import normal_init 5 | 6 | from mmdet.core import delta2bbox 7 | from mmdet.ops import nms 8 | from .anchor_head import AnchorHead 9 | from ..registry import HEADS 10 | 11 | 12 | @HEADS.register_module 13 | class RPNHead(AnchorHead): 14 | 15 | def __init__(self, in_channels, **kwargs): 16 | super(RPNHead, self).__init__(2, in_channels, **kwargs) 17 | 18 | def _init_layers(self): 19 | self.rpn_conv = nn.Conv2d( 20 | self.in_channels, self.feat_channels, 3, padding=1) 21 | self.rpn_cls = nn.Conv2d(self.feat_channels, 22 | self.num_anchors * self.cls_out_channels, 1) 23 | self.rpn_reg = nn.Conv2d(self.feat_channels, self.num_anchors * 4, 1) 24 | 25 | def init_weights(self): 26 | normal_init(self.rpn_conv, std=0.01) 27 | normal_init(self.rpn_cls, std=0.01) 28 | normal_init(self.rpn_reg, std=0.01) 29 | 30 | def forward_single(self, x): 31 | x = self.rpn_conv(x) 32 | x = F.relu(x, inplace=True) 33 | rpn_cls_score = self.rpn_cls(x) 34 | rpn_bbox_pred = self.rpn_reg(x) 35 | return rpn_cls_score, rpn_bbox_pred 36 | 37 | def loss(self, cls_scores, bbox_preds, gt_bboxes, img_metas, cfg): 38 | losses = super(RPNHead, self).loss(cls_scores, bbox_preds, gt_bboxes, 39 | None, img_metas, cfg) 40 | return dict( 41 | loss_rpn_cls=losses['loss_cls'], loss_rpn_reg=losses['loss_reg']) 42 | 43 | def get_bboxes_single(self, 44 | cls_scores, 45 | bbox_preds, 46 | mlvl_anchors, 47 | img_shape, 48 | scale_factor, 49 | cfg, 50 | rescale=False): 51 | mlvl_proposals = [] 52 | for idx in range(len(cls_scores)): 53 | rpn_cls_score = cls_scores[idx] 54 | rpn_bbox_pred = bbox_preds[idx] 55 | assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:] 56 | anchors = mlvl_anchors[idx] 57 | rpn_cls_score = rpn_cls_score.permute(1, 2, 0) 58 | if self.use_sigmoid_cls: 59 | rpn_cls_score = rpn_cls_score.reshape(-1) 60 | scores = rpn_cls_score.sigmoid() 61 | else: 62 | rpn_cls_score = rpn_cls_score.reshape(-1, 2) 63 | scores = rpn_cls_score.softmax(dim=1)[:, 1] 64 | rpn_bbox_pred = 
rpn_bbox_pred.permute(1, 2, 0).reshape(-1, 4) 65 | if cfg.nms_pre > 0 and scores.shape[0] > cfg.nms_pre: 66 | _, topk_inds = scores.topk(cfg.nms_pre) 67 | rpn_bbox_pred = rpn_bbox_pred[topk_inds, :] 68 | anchors = anchors[topk_inds, :] 69 | scores = scores[topk_inds] 70 | proposals = delta2bbox(anchors, rpn_bbox_pred, self.target_means, 71 | self.target_stds, img_shape) 72 | if cfg.min_bbox_size > 0: 73 | w = proposals[:, 2] - proposals[:, 0] + 1 74 | h = proposals[:, 3] - proposals[:, 1] + 1 75 | valid_inds = torch.nonzero((w >= cfg.min_bbox_size) & 76 | (h >= cfg.min_bbox_size)).squeeze() 77 | proposals = proposals[valid_inds, :] 78 | scores = scores[valid_inds] 79 | proposals = torch.cat([proposals, scores.unsqueeze(-1)], dim=-1) 80 | proposals, _ = nms(proposals, cfg.nms_thr) 81 | proposals = proposals[:cfg.nms_post, :] 82 | mlvl_proposals.append(proposals) 83 | proposals = torch.cat(mlvl_proposals, 0) 84 | if cfg.nms_across_levels: 85 | proposals, _ = nms(proposals, cfg.nms_thr) 86 | proposals = proposals[:cfg.max_num, :] 87 | else: 88 | scores = proposals[:, 4] 89 | num = min(cfg.max_num, proposals.shape[0]) 90 | _, topk_inds = scores.topk(num) 91 | proposals = proposals[topk_inds, :] 92 | return proposals 93 | -------------------------------------------------------------------------------- /configs/fast_rcnn_r50_fpn_1x.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='FastRCNN', 4 | pretrained='modelzoo://resnet50', 5 | backbone=dict( 6 | type='ResNet', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=(0, 1, 2, 3), 10 | frozen_stages=1, 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[256, 512, 1024, 2048], 15 | out_channels=256, 16 | num_outs=5), 17 | bbox_roi_extractor=dict( 18 | type='SingleRoIExtractor', 19 | roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), 20 | out_channels=256, 21 | featmap_strides=[4, 8, 16, 32]), 22 | bbox_head=dict( 23 | type='SharedFCBBoxHead', 24 | num_fcs=2, 25 | in_channels=256, 26 | fc_out_channels=1024, 27 | roi_feat_size=7, 28 | num_classes=81, 29 | target_means=[0., 0., 0., 0.], 30 | target_stds=[0.1, 0.1, 0.2, 0.2], 31 | reg_class_agnostic=False)) 32 | # model training and testing settings 33 | train_cfg = dict( 34 | rcnn=dict( 35 | assigner=dict( 36 | type='MaxIoUAssigner', 37 | pos_iou_thr=0.5, 38 | neg_iou_thr=0.5, 39 | min_pos_iou=0.5, 40 | ignore_iof_thr=-1), 41 | sampler=dict( 42 | type='RandomSampler', 43 | num=512, 44 | pos_fraction=0.25, 45 | neg_pos_ub=-1, 46 | add_gt_as_proposals=True), 47 | pos_weight=-1, 48 | debug=False)) 49 | test_cfg = dict( 50 | rcnn=dict( 51 | score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100)) 52 | # dataset settings 53 | dataset_type = 'CocoDataset' 54 | data_root = 'data/coco/' 55 | img_norm_cfg = dict( 56 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 57 | data = dict( 58 | imgs_per_gpu=2, 59 | workers_per_gpu=2, 60 | train=dict( 61 | type=dataset_type, 62 | ann_file=data_root + 'annotations/instances_train2017.json', 63 | img_prefix=data_root + 'train2017/', 64 | img_scale=(1333, 800), 65 | img_norm_cfg=img_norm_cfg, 66 | size_divisor=32, 67 | proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_train2017.pkl', 68 | flip_ratio=0.5, 69 | with_mask=False, 70 | with_crowd=True, 71 | with_label=True), 72 | val=dict( 73 | type=dataset_type, 74 | ann_file=data_root + 'annotations/instances_val2017.json', 75 | img_prefix=data_root + 'val2017/', 76 | 
img_scale=(1333, 800), 77 | img_norm_cfg=img_norm_cfg, 78 | proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_val2017.pkl', 79 | size_divisor=32, 80 | flip_ratio=0, 81 | with_mask=False, 82 | with_crowd=True, 83 | with_label=True), 84 | test=dict( 85 | type=dataset_type, 86 | ann_file=data_root + 'annotations/instances_val2017.json', 87 | img_prefix=data_root + 'val2017/', 88 | img_scale=(1333, 800), 89 | img_norm_cfg=img_norm_cfg, 90 | proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_val2017.pkl', 91 | size_divisor=32, 92 | flip_ratio=0, 93 | with_mask=False, 94 | with_label=False, 95 | test_mode=True)) 96 | # optimizer 97 | optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) 98 | optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) 99 | # learning policy 100 | lr_config = dict( 101 | policy='step', 102 | warmup='linear', 103 | warmup_iters=500, 104 | warmup_ratio=1.0 / 3, 105 | step=[8, 11]) 106 | checkpoint_config = dict(interval=1) 107 | # yapf:disable 108 | log_config = dict( 109 | interval=50, 110 | hooks=[ 111 | dict(type='TextLoggerHook'), 112 | # dict(type='TensorboardLoggerHook') 113 | ]) 114 | # yapf:enable 115 | # runtime settings 116 | total_epochs = 12 117 | dist_params = dict(backend='nccl') 118 | log_level = 'INFO' 119 | work_dir = './work_dirs/fast_rcnn_r50_fpn_1x' 120 | load_from = None 121 | resume_from = None 122 | workflow = [('train', 1)] 123 | -------------------------------------------------------------------------------- /configs/fast_rcnn_r101_fpn_1x.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='FastRCNN', 4 | pretrained='modelzoo://resnet101', 5 | backbone=dict( 6 | type='ResNet', 7 | depth=101, 8 | num_stages=4, 9 | out_indices=(0, 1, 2, 3), 10 | frozen_stages=1, 11 | style='pytorch'), 12 | neck=dict( 13 | type='FPN', 14 | in_channels=[256, 512, 1024, 2048], 15 | out_channels=256, 16 | num_outs=5), 17 | bbox_roi_extractor=dict( 18 | type='SingleRoIExtractor', 19 | roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), 20 | out_channels=256, 21 | featmap_strides=[4, 8, 16, 32]), 22 | bbox_head=dict( 23 | type='SharedFCBBoxHead', 24 | num_fcs=2, 25 | in_channels=256, 26 | fc_out_channels=1024, 27 | roi_feat_size=7, 28 | num_classes=81, 29 | target_means=[0., 0., 0., 0.], 30 | target_stds=[0.1, 0.1, 0.2, 0.2], 31 | reg_class_agnostic=False)) 32 | # model training and testing settings 33 | train_cfg = dict( 34 | rcnn=dict( 35 | assigner=dict( 36 | type='MaxIoUAssigner', 37 | pos_iou_thr=0.5, 38 | neg_iou_thr=0.5, 39 | min_pos_iou=0.5, 40 | ignore_iof_thr=-1), 41 | sampler=dict( 42 | type='RandomSampler', 43 | num=512, 44 | pos_fraction=0.25, 45 | neg_pos_ub=-1, 46 | add_gt_as_proposals=True), 47 | pos_weight=-1, 48 | debug=False)) 49 | test_cfg = dict( 50 | rcnn=dict( 51 | score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100)) 52 | # dataset settings 53 | dataset_type = 'CocoDataset' 54 | data_root = 'data/coco/' 55 | img_norm_cfg = dict( 56 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 57 | data = dict( 58 | imgs_per_gpu=2, 59 | workers_per_gpu=2, 60 | train=dict( 61 | type=dataset_type, 62 | ann_file=data_root + 'annotations/instances_train2017.json', 63 | img_prefix=data_root + 'train2017/', 64 | img_scale=(1333, 800), 65 | img_norm_cfg=img_norm_cfg, 66 | size_divisor=32, 67 | proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_train2017.pkl', 68 | flip_ratio=0.5, 69 | with_mask=False, 70 | 
with_crowd=True, 71 | with_label=True), 72 | val=dict( 73 | type=dataset_type, 74 | ann_file=data_root + 'annotations/instances_val2017.json', 75 | img_prefix=data_root + 'val2017/', 76 | img_scale=(1333, 800), 77 | img_norm_cfg=img_norm_cfg, 78 | proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_val2017.pkl', 79 | size_divisor=32, 80 | flip_ratio=0, 81 | with_mask=False, 82 | with_crowd=True, 83 | with_label=True), 84 | test=dict( 85 | type=dataset_type, 86 | ann_file=data_root + 'annotations/instances_val2017.json', 87 | img_prefix=data_root + 'val2017/', 88 | img_scale=(1333, 800), 89 | img_norm_cfg=img_norm_cfg, 90 | proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_val2017.pkl', 91 | size_divisor=32, 92 | flip_ratio=0, 93 | with_mask=False, 94 | with_label=False, 95 | test_mode=True)) 96 | # optimizer 97 | optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) 98 | optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) 99 | # learning policy 100 | lr_config = dict( 101 | policy='step', 102 | warmup='linear', 103 | warmup_iters=500, 104 | warmup_ratio=1.0 / 3, 105 | step=[8, 11]) 106 | checkpoint_config = dict(interval=1) 107 | # yapf:disable 108 | log_config = dict( 109 | interval=50, 110 | hooks=[ 111 | dict(type='TextLoggerHook'), 112 | # dict(type='TensorboardLoggerHook') 113 | ]) 114 | # yapf:enable 115 | # runtime settings 116 | total_epochs = 12 117 | dist_params = dict(backend='nccl') 118 | log_level = 'INFO' 119 | work_dir = './work_dirs/fast_rcnn_r101_fpn_1x' 120 | load_from = None 121 | resume_from = None 122 | workflow = [('train', 1)] 123 | -------------------------------------------------------------------------------- /mmdet/datasets/utils.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from collections.abc import Sequence 3 | 4 | import mmcv 5 | from mmcv.runner import obj_from_dict 6 | import torch 7 | 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | from .concat_dataset import ConcatDataset 11 | from .repeat_dataset import RepeatDataset 12 | from .. import datasets 13 | 14 | 15 | def to_tensor(data): 16 | """Convert objects of various python types to :obj:`torch.Tensor`. 17 | 18 | Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, 19 | :class:`Sequence`, :class:`int` and :class:`float`. 20 | """ 21 | if isinstance(data, torch.Tensor): 22 | return data 23 | elif isinstance(data, np.ndarray): 24 | return torch.from_numpy(data) 25 | elif isinstance(data, Sequence) and not mmcv.is_str(data): 26 | return torch.tensor(data) 27 | elif isinstance(data, int): 28 | return torch.LongTensor([data]) 29 | elif isinstance(data, float): 30 | return torch.FloatTensor([data]) 31 | else: 32 | raise TypeError('type {} cannot be converted to tensor.'.format( 33 | type(data))) 34 | 35 | 36 | def random_scale(img_scales, mode='range'): 37 | """Randomly select a scale from a list of scales or scale ranges. 38 | 39 | Args: 40 | img_scales (list[tuple]): Image scale or scale range. 41 | mode (str): "range" or "value". 42 | 43 | Returns: 44 | tuple: Sampled image scale.
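A minimal usage sketch of to_tensor's dispatch rules above (assuming the package is installed so that mmdet.datasets.utils is importable):

import numpy as np
import torch
from mmdet.datasets.utils import to_tensor

assert to_tensor(torch.zeros(2)).shape == (2,)       # Tensor passes through unchanged
assert to_tensor(np.zeros((2, 3))).shape == (2, 3)   # ndarray -> torch.from_numpy
assert to_tensor([1, 2, 3]).tolist() == [1, 2, 3]    # non-str sequence -> torch.tensor
assert to_tensor(5).dtype == torch.int64             # int -> LongTensor
assert to_tensor(0.5).dtype == torch.float32         # float -> FloatTensor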
45 | """ 46 | num_scales = len(img_scales) 47 | if num_scales == 1: # fixed scale is specified 48 | img_scale = img_scales[0] 49 | elif num_scales == 2: # randomly sample a scale 50 | if mode == 'range': 51 | img_scale_long = [max(s) for s in img_scales] 52 | img_scale_short = [min(s) for s in img_scales] 53 | long_edge = np.random.randint( 54 | min(img_scale_long), 55 | max(img_scale_long) + 1) 56 | short_edge = np.random.randint( 57 | min(img_scale_short), 58 | max(img_scale_short) + 1) 59 | img_scale = (long_edge, short_edge) 60 | elif mode == 'value': 61 | img_scale = img_scales[np.random.randint(num_scales)] 62 | else: 63 | if mode != 'value': 64 | raise ValueError( 65 | 'Only "value" mode supports more than 2 image scales') 66 | img_scale = img_scales[np.random.randint(num_scales)] 67 | return img_scale 68 | 69 | 70 | def show_ann(coco, img, ann_info): 71 | plt.imshow(mmcv.bgr2rgb(img)) 72 | plt.axis('off') 73 | coco.showAnns(ann_info) 74 | plt.show() 75 | 76 | 77 | def get_dataset(data_cfg): 78 | if data_cfg['type'] == 'RepeatDataset': 79 | return RepeatDataset( 80 | get_dataset(data_cfg['dataset']), data_cfg['times']) 81 | 82 | if isinstance(data_cfg['ann_file'], (list, tuple)): 83 | ann_files = data_cfg['ann_file'] 84 | num_dset = len(ann_files) 85 | else: 86 | ann_files = [data_cfg['ann_file']] 87 | num_dset = 1 88 | 89 | if 'proposal_file' in data_cfg.keys(): 90 | if isinstance(data_cfg['proposal_file'], (list, tuple)): 91 | proposal_files = data_cfg['proposal_file'] 92 | else: 93 | proposal_files = [data_cfg['proposal_file']] 94 | else: 95 | proposal_files = [None] * num_dset 96 | assert len(proposal_files) == num_dset 97 | 98 | if isinstance(data_cfg['img_prefix'], (list, tuple)): 99 | img_prefixes = data_cfg['img_prefix'] 100 | else: 101 | img_prefixes = [data_cfg['img_prefix']] * num_dset 102 | assert len(img_prefixes) == num_dset 103 | 104 | dsets = [] 105 | for i in range(num_dset): 106 | data_info = copy.deepcopy(data_cfg) 107 | data_info['ann_file'] = ann_files[i] 108 | data_info['proposal_file'] = proposal_files[i] 109 | data_info['img_prefix'] = img_prefixes[i] 110 | dset = obj_from_dict(data_info, datasets) 111 | dsets.append(dset) 112 | if len(dsets) > 1: 113 | dset = ConcatDataset(dsets) 114 | else: 115 | dset = dsets[0] 116 | return dset 117 | -------------------------------------------------------------------------------- /mmdet/core/loss/losses.py: -------------------------------------------------------------------------------- 1 | # TODO merge naive and weighted loss. 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | 6 | def weighted_nll_loss(pred, label, weight, avg_factor=None): 7 | if avg_factor is None: 8 | avg_factor = max(torch.sum(weight > 0).float().item(), 1.) 9 | raw = F.nll_loss(pred, label, reduction='none') 10 | return torch.sum(raw * weight)[None] / avg_factor 11 | 12 | 13 | def weighted_cross_entropy(pred, label, weight, avg_factor=None, 14 | reduce=True): 15 | if avg_factor is None: 16 | avg_factor = max(torch.sum(weight > 0).float().item(), 1.) 17 | raw = F.cross_entropy(pred, label, reduction='none') 18 | if reduce: 19 | return torch.sum(raw * weight)[None] / avg_factor 20 | else: 21 | return raw * weight / avg_factor 22 | 23 | 24 | def weighted_binary_cross_entropy(pred, label, weight, avg_factor=None): 25 | if avg_factor is None: 26 | avg_factor = max(torch.sum(weight > 0).float().item(), 1.) 
27 | return F.binary_cross_entropy_with_logits( 28 | pred, label.float(), weight.float(), 29 | reduction='sum')[None] / avg_factor 30 | 31 | 32 | def sigmoid_focal_loss(pred, 33 | target, 34 | weight, 35 | gamma=2.0, 36 | alpha=0.25, 37 | reduction='mean'): 38 | pred_sigmoid = pred.sigmoid() 39 | pt = (1 - pred_sigmoid) * target + pred_sigmoid * (1 - target) 40 | weight = (alpha * target + (1 - alpha) * (1 - target)) * weight 41 | weight = weight * pt.pow(gamma) 42 | return F.binary_cross_entropy_with_logits( 43 | pred, target, weight, reduction=reduction) 44 | 45 | 46 | def weighted_sigmoid_focal_loss(pred, 47 | target, 48 | weight, 49 | gamma=2.0, 50 | alpha=0.25, 51 | avg_factor=None, 52 | num_classes=80): 53 | if avg_factor is None: 54 | avg_factor = torch.sum(weight > 0).float().item() / num_classes + 1e-6 55 | return sigmoid_focal_loss( 56 | pred, target, weight, gamma=gamma, alpha=alpha, 57 | reduction='sum')[None] / avg_factor 58 | 59 | 60 | def mask_cross_entropy(pred, target, label): 61 | num_rois = pred.size()[0] 62 | inds = torch.arange(0, num_rois, dtype=torch.long, device=pred.device) 63 | pred_slice = pred[inds, label].squeeze(1) 64 | return F.binary_cross_entropy_with_logits( 65 | pred_slice, target, reduction='mean')[None] 66 | 67 | 68 | def smooth_l1_loss(pred, target, beta=1.0, reduction='mean'): 69 | assert beta > 0 70 | assert pred.size() == target.size() and target.numel() > 0 71 | diff = torch.abs(pred - target) 72 | loss = torch.where(diff < beta, 0.5 * diff * diff / beta, 73 | diff - 0.5 * beta) 74 | reduction = F._Reduction.get_enum(reduction) 75 | # none: 0, mean: 1, sum: 2 76 | if reduction == 0: 77 | return loss 78 | elif reduction == 1: 79 | return loss.sum() / pred.numel() 80 | elif reduction == 2: 81 | return loss.sum() 82 | 83 | 84 | def weighted_smoothl1(pred, target, weight, beta=1.0, avg_factor=None): 85 | if avg_factor is None: 86 | avg_factor = torch.sum(weight > 0).float().item() / 4 + 1e-6 87 | loss = smooth_l1_loss(pred, target, beta, reduction='none') 88 | return torch.sum(loss * weight)[None] / avg_factor 89 | 90 | 91 | def accuracy(pred, target, topk=1): 92 | if isinstance(topk, int): 93 | topk = (topk, ) 94 | return_single = True 95 | else: 96 | return_single = False 97 | 98 | maxk = max(topk) 99 | _, pred_label = pred.topk(maxk, 1, True, True) 100 | pred_label = pred_label.t() 101 | correct = pred_label.eq(target.view(1, -1).expand_as(pred_label)) 102 | 103 | res = [] 104 | for k in topk: 105 | correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) 106 | res.append(correct_k.mul_(100.0 / pred.size(0))) 107 | return res[0] if return_single else res 108 | -------------------------------------------------------------------------------- /mmdet/datasets/transforms.py: -------------------------------------------------------------------------------- 1 | import mmcv 2 | import numpy as np 3 | import torch 4 | 5 | __all__ = ['ImageTransform', 'BboxTransform', 'MaskTransform', 'Numpy2Tensor'] 6 | 7 | 8 | class ImageTransform(object): 9 | """Preprocess an image. 10 | 11 | 1. rescale the image to expected size 12 | 2. normalize the image 13 | 3. flip the image (if needed) 14 | 4. pad the image (if needed) 15 | 5. 
transpose to (c, h, w) 16 | """ 17 | 18 | def __init__(self, 19 | mean=(0, 0, 0), 20 | std=(1, 1, 1), 21 | to_rgb=True, 22 | size_divisor=None): 23 | self.mean = np.array(mean, dtype=np.float32) 24 | self.std = np.array(std, dtype=np.float32) 25 | self.to_rgb = to_rgb 26 | self.size_divisor = size_divisor 27 | 28 | def __call__(self, img, scale, flip=False, keep_ratio=True): 29 | if keep_ratio: 30 | img, scale_factor = mmcv.imrescale(img, scale, return_scale=True) 31 | else: 32 | img, w_scale, h_scale = mmcv.imresize( 33 | img, scale, return_scale=True) 34 | scale_factor = np.array([w_scale, h_scale, w_scale, h_scale], 35 | dtype=np.float32) 36 | img_shape = img.shape 37 | img = mmcv.imnormalize(img, self.mean, self.std, self.to_rgb) 38 | if flip: 39 | img = mmcv.imflip(img) 40 | if self.size_divisor is not None: 41 | img = mmcv.impad_to_multiple(img, self.size_divisor) 42 | pad_shape = img.shape 43 | else: 44 | pad_shape = img_shape 45 | img = img.transpose(2, 0, 1) 46 | return img, img_shape, pad_shape, scale_factor 47 | 48 | 49 | def bbox_flip(bboxes, img_shape): 50 | """Flip bboxes horizontally. 51 | 52 | Args: 53 | bboxes(ndarray): shape (..., 4*k) 54 | img_shape(tuple): (height, width) 55 | """ 56 | assert bboxes.shape[-1] % 4 == 0 57 | w = img_shape[1] 58 | flipped = bboxes.copy() 59 | flipped[..., 0::4] = w - bboxes[..., 2::4] - 1 60 | flipped[..., 2::4] = w - bboxes[..., 0::4] - 1 61 | return flipped 62 | 63 | 64 | class BboxTransform(object): 65 | """Preprocess gt bboxes. 66 | 67 | 1. rescale bboxes according to image size 68 | 2. flip bboxes (if needed) 69 | 3. pad the first dimension to `max_num_gts` 70 | """ 71 | 72 | def __init__(self, max_num_gts=None): 73 | self.max_num_gts = max_num_gts 74 | 75 | def __call__(self, bboxes, img_shape, scale_factor, flip=False): 76 | gt_bboxes = bboxes * scale_factor 77 | if flip: 78 | gt_bboxes = bbox_flip(gt_bboxes, img_shape) 79 | gt_bboxes[:, 0::2] = np.clip(gt_bboxes[:, 0::2], 0, img_shape[1]) 80 | gt_bboxes[:, 1::2] = np.clip(gt_bboxes[:, 1::2], 0, img_shape[0]) 81 | if self.max_num_gts is None: 82 | return gt_bboxes 83 | else: 84 | num_gts = gt_bboxes.shape[0] 85 | padded_bboxes = np.zeros((self.max_num_gts, 4), dtype=np.float32) 86 | padded_bboxes[:num_gts, :] = gt_bboxes 87 | return padded_bboxes 88 | 89 | 90 | class MaskTransform(object): 91 | """Preprocess masks. 92 | 93 | 1. resize masks to expected size and stack to a single array 94 | 2. flip the masks (if needed) 95 | 3. 
pad the masks (if needed) 96 | """ 97 | 98 | def __call__(self, masks, pad_shape, scale_factor, flip=False): 99 | masks = [ 100 | mmcv.imrescale(mask, scale_factor, interpolation='nearest') 101 | for mask in masks 102 | ] 103 | if flip: 104 | masks = [mask[:, ::-1] for mask in masks] 105 | padded_masks = [ 106 | mmcv.impad(mask, pad_shape[:2], pad_val=0) for mask in masks 107 | ] 108 | padded_masks = np.stack(padded_masks, axis=0) 109 | return padded_masks 110 | 111 | 112 | class Numpy2Tensor(object): 113 | 114 | def __init__(self): 115 | pass 116 | 117 | def __call__(self, *args): 118 | if len(args) == 1: 119 | return torch.from_numpy(args[0]) 120 | else: 121 | return tuple([torch.from_numpy(np.array(array)) for array in args]) 122 | -------------------------------------------------------------------------------- /mmdet/apis/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | from collections import OrderedDict 4 | 5 | import torch 6 | from mmcv.runner import Runner, DistSamplerSeedHook 7 | from mmcv.parallel import MMDataParallel, MMDistributedDataParallel 8 | 9 | from mmdet.core import (DistOptimizerHook, DistEvalmAPHook, 10 | CocoDistEvalRecallHook, CocoDistEvalmAPHook) 11 | from mmdet.datasets import build_dataloader 12 | from mmdet.models import RPN 13 | from .env import get_root_logger 14 | 15 | 16 | def parse_losses(losses): 17 | log_vars = OrderedDict() 18 | for loss_name, loss_value in losses.items(): 19 | if isinstance(loss_value, torch.Tensor): 20 | log_vars[loss_name] = loss_value.mean() 21 | elif isinstance(loss_value, list): 22 | log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value) 23 | else: 24 | raise TypeError( 25 | '{} is not a tensor or list of tensors'.format(loss_name)) 26 | 27 | loss = sum(_value for _key, _value in log_vars.items() if 'loss' in _key) 28 | 29 | log_vars['loss'] = loss 30 | for name in log_vars: 31 | log_vars[name] = log_vars[name].item() 32 | 33 | return loss, log_vars 34 | 35 | 36 | def batch_processor(model, data, train_mode): 37 | losses = model(**data) 38 | loss, log_vars = parse_losses(losses) 39 | 40 | outputs = dict( 41 | loss=loss, log_vars=log_vars, num_samples=len(data['img'].data)) 42 | 43 | return outputs 44 | 45 | 46 | def train_detector(model, 47 | dataset, 48 | cfg, 49 | distributed=False, 50 | validate=False, 51 | logger=None): 52 | if logger is None: 53 | logger = get_root_logger(cfg.log_level) 54 | 55 | # start training 56 | if distributed: 57 | _dist_train(model, dataset, cfg, validate=validate) 58 | else: 59 | _non_dist_train(model, dataset, cfg, validate=validate) 60 | 61 | 62 | def _dist_train(model, dataset, cfg, validate=False): 63 | # prepare data loaders 64 | data_loaders = [ 65 | build_dataloader( 66 | dataset, 67 | cfg.data.imgs_per_gpu, 68 | cfg.data.workers_per_gpu, 69 | dist=True) 70 | ] 71 | # put model on gpus 72 | model = MMDistributedDataParallel(model.cuda()) 73 | # build runner 74 | runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir, 75 | cfg.log_level) 76 | # register hooks 77 | optimizer_config = DistOptimizerHook(**cfg.optimizer_config) 78 | runner.register_training_hooks(cfg.lr_config, optimizer_config, 79 | cfg.checkpoint_config, cfg.log_config) 80 | runner.register_hook(DistSamplerSeedHook()) 81 | # register eval hooks 82 | if validate: 83 | if isinstance(model.module, RPN): 84 | # TODO: implement recall hooks for other datasets 85 | runner.register_hook(CocoDistEvalRecallHook(cfg.data.val)) 86 | 
else: 87 | if cfg.data.val.type == 'CocoDataset': 88 | runner.register_hook(CocoDistEvalmAPHook(cfg.data.val)) 89 | else: 90 | runner.register_hook(DistEvalmAPHook(cfg.data.val)) 91 | 92 | if cfg.resume_from: 93 | runner.resume(cfg.resume_from) 94 | elif cfg.load_from: 95 | runner.load_checkpoint(cfg.load_from) 96 | runner.run(data_loaders, cfg.workflow, cfg.total_epochs) 97 | 98 | 99 | def _non_dist_train(model, dataset, cfg, validate=False): 100 | # prepare data loaders 101 | data_loaders = [ 102 | build_dataloader( 103 | dataset, 104 | cfg.data.imgs_per_gpu, 105 | cfg.data.workers_per_gpu, 106 | cfg.gpus, 107 | dist=False) 108 | ] 109 | # put model on gpus 110 | model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() 111 | # build runner 112 | runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir, 113 | cfg.log_level) 114 | runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config, 115 | cfg.checkpoint_config, cfg.log_config) 116 | 117 | if cfg.resume_from: 118 | runner.resume(cfg.resume_from) 119 | elif cfg.load_from: 120 | runner.load_checkpoint(cfg.load_from) 121 | runner.run(data_loaders, cfg.workflow, cfg.total_epochs) 122 | -------------------------------------------------------------------------------- /tools/test_video.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | import mmcv 5 | from mmcv.runner import load_checkpoint, parallel_test, obj_from_dict 6 | from mmcv.parallel import scatter, collate, MMDataParallel 7 | 8 | from mmdet import datasets 9 | from mmdet.core import results2json_videoseg, ytvos_eval 10 | from mmdet.datasets import build_dataloader 11 | from mmdet.models import build_detector, detectors 12 | 13 | 14 | def single_test(model, data_loader, show=False, save_path=''): 15 | model.eval() 16 | results = [] 17 | dataset = data_loader.dataset 18 | prog_bar = mmcv.ProgressBar(len(dataset)) 19 | for i, data in enumerate(data_loader): 20 | with torch.no_grad(): 21 | result = model(return_loss=False, rescale=not show, **data) 22 | results.append(result) 23 | 24 | if show: 25 | model.module.show_result(data, result, dataset.img_norm_cfg, 26 | dataset=dataset.CLASSES, 27 | save_vis=True, 28 | save_path=save_path, 29 | is_video=True) 30 | 31 | batch_size = data['img'][0].size(0) 32 | for _ in range(batch_size): 33 | prog_bar.update() 34 | return results 35 | 36 | 37 | def _data_func(data, device_id): 38 | data = scatter(collate([data], samples_per_gpu=1), [device_id])[0] 39 | return dict(return_loss=False, rescale=True, **data) 40 | 41 | 42 | def parse_args(): 43 | parser = argparse.ArgumentParser(description='MMDet test detector') 44 | parser.add_argument('config', help='test config file path') 45 | parser.add_argument('checkpoint', help='checkpoint file') 46 | parser.add_argument( 47 | '--save_path', 48 | type=str, 49 | help='path to save visual result') 50 | parser.add_argument( 51 | '--gpus', default=1, type=int, help='GPU number used for testing') 52 | parser.add_argument( 53 | '--proc_per_gpu', 54 | default=1, 55 | type=int, 56 | help='Number of processes per GPU') 57 | parser.add_argument('--out', help='output result file') 58 | parser.add_argument('--load_result', 59 | action='store_true', 60 | help='whether to load existing result') 61 | parser.add_argument( 62 | '--eval', 63 | type=str, 64 | nargs='+', 65 | choices=['bbox', 'segm'], 66 | help='eval types') 67 | parser.add_argument('--show', action='store_true', help='show results') 68 | args = 
parser.parse_args() 69 | return args 70 | 71 | 72 | def main(): 73 | args = parse_args() 74 | 75 | if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): 76 | raise ValueError('The output file must be a pkl file.') 77 | 78 | cfg = mmcv.Config.fromfile(args.config) 79 | # set cudnn_benchmark 80 | if cfg.get('cudnn_benchmark', False): 81 | torch.backends.cudnn.benchmark = True 82 | cfg.model.pretrained = None 83 | cfg.data.test.test_mode = True 84 | 85 | dataset = obj_from_dict(cfg.data.test, datasets, dict(test_mode=True)) 86 | assert args.gpus == 1 87 | model = build_detector( 88 | cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) 89 | load_checkpoint(model, args.checkpoint) 90 | model = MMDataParallel(model, device_ids=[0]) 91 | 92 | data_loader = build_dataloader( 93 | dataset, 94 | imgs_per_gpu=1, 95 | workers_per_gpu=cfg.data.workers_per_gpu, 96 | num_gpus=1, 97 | dist=False, 98 | shuffle=False) 99 | if args.load_result: 100 | outputs = mmcv.load(args.out) 101 | else: 102 | outputs = single_test(model, data_loader, args.show, save_path=args.save_path) 103 | 104 | if args.out: 105 | if not args.load_result: 106 | print('writing results to {}'.format(args.out)) 107 | 108 | mmcv.dump(outputs, args.out) 109 | eval_types = args.eval 110 | if eval_types: 111 | print('Starting to evaluate {}'.format(' and '.join(eval_types))) 112 | if not isinstance(outputs[0], dict): 113 | result_file = args.out + '.json' 114 | results2json_videoseg(dataset, outputs, result_file) 115 | ytvos_eval(result_file, eval_types, dataset.ytvos) 116 | else: 117 | raise NotImplementedError 118 | 119 | if __name__ == '__main__': 120 | main() 121 | -------------------------------------------------------------------------------- /configs/ssd300_coco.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | input_size = 300 3 | model = dict( 4 | type='SingleStageDetector', 5 | pretrained='open-mmlab://vgg16_caffe', 6 | backbone=dict( 7 | type='SSDVGG', 8 | input_size=input_size, 9 | depth=16, 10 | with_last_pool=False, 11 | ceil_mode=True, 12 | out_indices=(3, 4), 13 | out_feature_indices=(22, 34), 14 | l2_norm_scale=20), 15 | neck=None, 16 | bbox_head=dict( 17 | type='SSDHead', 18 | input_size=input_size, 19 | in_channels=(512, 1024, 512, 256, 256, 256), 20 | num_classes=81, 21 | anchor_strides=(8, 16, 32, 64, 100, 300), 22 | basesize_ratio_range=(0.15, 0.9), 23 | anchor_ratios=([2], [2, 3], [2, 3], [2, 3], [2], [2]), 24 | target_means=(.0, .0, .0, .0), 25 | target_stds=(0.1, 0.1, 0.2, 0.2))) 26 | cudnn_benchmark = True 27 | train_cfg = dict( 28 | assigner=dict( 29 | type='MaxIoUAssigner', 30 | pos_iou_thr=0.5, 31 | neg_iou_thr=0.5, 32 | min_pos_iou=0., 33 | ignore_iof_thr=-1, 34 | gt_max_assign_all=False), 35 | smoothl1_beta=1., 36 | allowed_border=-1, 37 | pos_weight=-1, 38 | neg_pos_ratio=3, 39 | debug=False) 40 | test_cfg = dict( 41 | nms=dict(type='nms', iou_thr=0.45), 42 | min_bbox_size=0, 43 | score_thr=0.02, 44 | max_per_img=200) 45 | # model training and testing settings 46 | # dataset settings 47 | dataset_type = 'CocoDataset' 48 | data_root = 'data/coco/' 49 | img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True) 50 | data = dict( 51 | imgs_per_gpu=8, 52 | workers_per_gpu=3, 53 | train=dict( 54 | type='RepeatDataset', 55 | times=5, 56 | dataset=dict( 57 | type=dataset_type, 58 | ann_file=data_root + 'annotations/instances_train2017.json', 59 | img_prefix=data_root + 'train2017/', 60 | img_scale=(300, 300), 61 | 
img_norm_cfg=img_norm_cfg, 62 | size_divisor=None, 63 | flip_ratio=0.5, 64 | with_mask=False, 65 | with_crowd=False, 66 | with_label=True, 67 | test_mode=False, 68 | extra_aug=dict( 69 | photo_metric_distortion=dict( 70 | brightness_delta=32, 71 | contrast_range=(0.5, 1.5), 72 | saturation_range=(0.5, 1.5), 73 | hue_delta=18), 74 | expand=dict( 75 | mean=img_norm_cfg['mean'], 76 | to_rgb=img_norm_cfg['to_rgb'], 77 | ratio_range=(1, 4)), 78 | random_crop=dict( 79 | min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), min_crop_size=0.3)), 80 | resize_keep_ratio=False)), 81 | val=dict( 82 | type=dataset_type, 83 | ann_file=data_root + 'annotations/instances_val2017.json', 84 | img_prefix=data_root + 'val2017/', 85 | img_scale=(300, 300), 86 | img_norm_cfg=img_norm_cfg, 87 | size_divisor=None, 88 | flip_ratio=0, 89 | with_mask=False, 90 | with_label=False, 91 | test_mode=True, 92 | resize_keep_ratio=False), 93 | test=dict( 94 | type=dataset_type, 95 | ann_file=data_root + 'annotations/instances_val2017.json', 96 | img_prefix=data_root + 'val2017/', 97 | img_scale=(300, 300), 98 | img_norm_cfg=img_norm_cfg, 99 | size_divisor=None, 100 | flip_ratio=0, 101 | with_mask=False, 102 | with_label=False, 103 | test_mode=True, 104 | resize_keep_ratio=False)) 105 | # optimizer 106 | optimizer = dict(type='SGD', lr=2e-3, momentum=0.9, weight_decay=5e-4) 107 | optimizer_config = dict() 108 | # learning policy 109 | lr_config = dict( 110 | policy='step', 111 | warmup='linear', 112 | warmup_iters=500, 113 | warmup_ratio=1.0 / 3, 114 | step=[16, 22]) 115 | checkpoint_config = dict(interval=1) 116 | # yapf:disable 117 | log_config = dict( 118 | interval=50, 119 | hooks=[ 120 | dict(type='TextLoggerHook'), 121 | # dict(type='TensorboardLoggerHook') 122 | ]) 123 | # yapf:enable 124 | # runtime settings 125 | total_epochs = 24 126 | dist_params = dict(backend='nccl') 127 | log_level = 'INFO' 128 | work_dir = './work_dirs/ssd300_coco' 129 | load_from = None 130 | resume_from = None 131 | workflow = [('train', 1)] 132 | -------------------------------------------------------------------------------- /mmdet/ops/nms/cpu_soft_nms.pyx: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------- 2 | # Soft-NMS: Improving Object Detection With One Line of Code 3 | # Copyright (c) University of Maryland, College Park 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Navaneeth Bodla and Bharat Singh 6 | # Modified by Kai Chen 7 | # ---------------------------------------------------------- 8 | 9 | import numpy as np 10 | cimport numpy as np 11 | 12 | 13 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 14 | return a if a >= b else b 15 | 16 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 17 | return a if a <= b else b 18 | 19 | 20 | def cpu_soft_nms( 21 | np.ndarray[float, ndim=2] boxes_in, 22 | float iou_thr, 23 | unsigned int method=1, 24 | float sigma=0.5, 25 | float min_score=0.001, 26 | ): 27 | boxes = boxes_in.copy() 28 | cdef unsigned int N = boxes.shape[0] 29 | cdef float iw, ih, box_area 30 | cdef float ua 31 | cdef int pos = 0 32 | cdef float maxscore = 0 33 | cdef int maxpos = 0 34 | cdef float x1, x2, y1, y2, tx1, tx2, ty1, ty2, ts, area, weight, ov 35 | inds = np.arange(N) 36 | 37 | for i in range(N): 38 | maxscore = boxes[i, 4] 39 | maxpos = i 40 | 41 | tx1 = boxes[i, 0] 42 | ty1 = boxes[i, 1] 43 | tx2 = boxes[i, 2] 44 | ty2 = boxes[i, 3] 45 | ts = boxes[i, 4] 46 | ti = inds[i] 47 | 
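# The pass below is the heart of soft-NMS: exactly like hard NMS it scans for
# the highest-scoring remaining box and swaps it into slot i, but overlapping
# boxes are not discarded outright. Their scores are decayed by the weight
# computed further down, and a box only drops out (shrinking N) once its
# score falls below min_score.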
48 | pos = i + 1 49 | # get max box 50 | while pos < N: 51 | if maxscore < boxes[pos, 4]: 52 | maxscore = boxes[pos, 4] 53 | maxpos = pos 54 | pos = pos + 1 55 | 56 | # add max box as a detection 57 | boxes[i, 0] = boxes[maxpos, 0] 58 | boxes[i, 1] = boxes[maxpos, 1] 59 | boxes[i, 2] = boxes[maxpos, 2] 60 | boxes[i, 3] = boxes[maxpos, 3] 61 | boxes[i, 4] = boxes[maxpos, 4] 62 | inds[i] = inds[maxpos] 63 | 64 | # swap ith box with position of max box 65 | boxes[maxpos, 0] = tx1 66 | boxes[maxpos, 1] = ty1 67 | boxes[maxpos, 2] = tx2 68 | boxes[maxpos, 3] = ty2 69 | boxes[maxpos, 4] = ts 70 | inds[maxpos] = ti 71 | 72 | tx1 = boxes[i, 0] 73 | ty1 = boxes[i, 1] 74 | tx2 = boxes[i, 2] 75 | ty2 = boxes[i, 3] 76 | ts = boxes[i, 4] 77 | 78 | pos = i + 1 79 | # NMS iterations, note that N changes if detection boxes fall below 80 | # threshold 81 | while pos < N: 82 | x1 = boxes[pos, 0] 83 | y1 = boxes[pos, 1] 84 | x2 = boxes[pos, 2] 85 | y2 = boxes[pos, 3] 86 | s = boxes[pos, 4] 87 | 88 | area = (x2 - x1 + 1) * (y2 - y1 + 1) 89 | iw = (min(tx2, x2) - max(tx1, x1) + 1) 90 | if iw > 0: 91 | ih = (min(ty2, y2) - max(ty1, y1) + 1) 92 | if ih > 0: 93 | ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih) 94 | ov = iw * ih / ua # iou between max box and detection box 95 | 96 | if method == 1: # linear 97 | if ov > iou_thr: 98 | weight = 1 - ov 99 | else: 100 | weight = 1 101 | elif method == 2: # gaussian 102 | weight = np.exp(-(ov * ov) / sigma) 103 | else: # original NMS 104 | if ov > iou_thr: 105 | weight = 0 106 | else: 107 | weight = 1 108 | 109 | boxes[pos, 4] = weight * boxes[pos, 4] 110 | 111 | # if box score falls below threshold, discard the box by 112 | # swapping with last box update N 113 | if boxes[pos, 4] < min_score: 114 | boxes[pos, 0] = boxes[N-1, 0] 115 | boxes[pos, 1] = boxes[N-1, 1] 116 | boxes[pos, 2] = boxes[N-1, 2] 117 | boxes[pos, 3] = boxes[N-1, 3] 118 | boxes[pos, 4] = boxes[N-1, 4] 119 | inds[pos] = inds[N - 1] 120 | N = N - 1 121 | pos = pos - 1 122 | 123 | pos = pos + 1 124 | 125 | return boxes[:N], inds[:N] 126 | -------------------------------------------------------------------------------- /configs/ssd512_coco.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | input_size = 512 3 | model = dict( 4 | type='SingleStageDetector', 5 | pretrained='open-mmlab://vgg16_caffe', 6 | backbone=dict( 7 | type='SSDVGG', 8 | input_size=input_size, 9 | depth=16, 10 | with_last_pool=False, 11 | ceil_mode=True, 12 | out_indices=(3, 4), 13 | out_feature_indices=(22, 34), 14 | l2_norm_scale=20), 15 | neck=None, 16 | bbox_head=dict( 17 | type='SSDHead', 18 | input_size=input_size, 19 | in_channels=(512, 1024, 512, 256, 256, 256, 256), 20 | num_classes=81, 21 | anchor_strides=(8, 16, 32, 64, 128, 256, 512), 22 | basesize_ratio_range=(0.1, 0.9), 23 | anchor_ratios=([2], [2, 3], [2, 3], [2, 3], [2, 3], [2], [2]), 24 | target_means=(.0, .0, .0, .0), 25 | target_stds=(0.1, 0.1, 0.2, 0.2))) 26 | cudnn_benchmark = True 27 | train_cfg = dict( 28 | assigner=dict( 29 | type='MaxIoUAssigner', 30 | pos_iou_thr=0.5, 31 | neg_iou_thr=0.5, 32 | min_pos_iou=0., 33 | ignore_iof_thr=-1, 34 | gt_max_assign_all=False), 35 | smoothl1_beta=1., 36 | allowed_border=-1, 37 | pos_weight=-1, 38 | neg_pos_ratio=3, 39 | debug=False) 40 | test_cfg = dict( 41 | nms=dict(type='nms', iou_thr=0.45), 42 | min_bbox_size=0, 43 | score_thr=0.02, 44 | max_per_img=200) 45 | # model training and testing settings 46 | # dataset settings 47 | dataset_type = 
'CocoDataset' 48 | data_root = 'data/coco/' 49 | img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True) 50 | data = dict( 51 | imgs_per_gpu=8, 52 | workers_per_gpu=3, 53 | train=dict( 54 | type='RepeatDataset', 55 | times=5, 56 | dataset=dict( 57 | type=dataset_type, 58 | ann_file=data_root + 'annotations/instances_train2017.json', 59 | img_prefix=data_root + 'train2017/', 60 | img_scale=(512, 512), 61 | img_norm_cfg=img_norm_cfg, 62 | size_divisor=None, 63 | flip_ratio=0.5, 64 | with_mask=False, 65 | with_crowd=False, 66 | with_label=True, 67 | test_mode=False, 68 | extra_aug=dict( 69 | photo_metric_distortion=dict( 70 | brightness_delta=32, 71 | contrast_range=(0.5, 1.5), 72 | saturation_range=(0.5, 1.5), 73 | hue_delta=18), 74 | expand=dict( 75 | mean=img_norm_cfg['mean'], 76 | to_rgb=img_norm_cfg['to_rgb'], 77 | ratio_range=(1, 4)), 78 | random_crop=dict( 79 | min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), min_crop_size=0.3)), 80 | resize_keep_ratio=False)), 81 | val=dict( 82 | type=dataset_type, 83 | ann_file=data_root + 'annotations/instances_val2017.json', 84 | img_prefix=data_root + 'val2017/', 85 | img_scale=(512, 512), 86 | img_norm_cfg=img_norm_cfg, 87 | size_divisor=None, 88 | flip_ratio=0, 89 | with_mask=False, 90 | with_label=False, 91 | test_mode=True, 92 | resize_keep_ratio=False), 93 | test=dict( 94 | type=dataset_type, 95 | ann_file=data_root + 'annotations/instances_val2017.json', 96 | img_prefix=data_root + 'val2017/', 97 | img_scale=(512, 512), 98 | img_norm_cfg=img_norm_cfg, 99 | size_divisor=None, 100 | flip_ratio=0, 101 | with_mask=False, 102 | with_label=False, 103 | test_mode=True, 104 | resize_keep_ratio=False)) 105 | # optimizer 106 | optimizer = dict(type='SGD', lr=2e-3, momentum=0.9, weight_decay=5e-4) 107 | optimizer_config = dict() 108 | # learning policy 109 | lr_config = dict( 110 | policy='step', 111 | warmup='linear', 112 | warmup_iters=500, 113 | warmup_ratio=1.0 / 3, 114 | step=[16, 22]) 115 | checkpoint_config = dict(interval=1) 116 | # yapf:disable 117 | log_config = dict( 118 | interval=50, 119 | hooks=[ 120 | dict(type='TextLoggerHook'), 121 | # dict(type='TensorboardLoggerHook') 122 | ]) 123 | # yapf:enable 124 | # runtime settings 125 | total_epochs = 24 126 | dist_params = dict(backend='nccl') 127 | log_level = 'INFO' 128 | work_dir = './work_dirs/ssd512_coco' 129 | load_from = None 130 | resume_from = None 131 | workflow = [('train', 1)] 132 | -------------------------------------------------------------------------------- /configs/pascal_voc/ssd300_voc.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | input_size = 300 3 | model = dict( 4 | type='SingleStageDetector', 5 | pretrained='open-mmlab://vgg16_caffe', 6 | backbone=dict( 7 | type='SSDVGG', 8 | input_size=input_size, 9 | depth=16, 10 | with_last_pool=False, 11 | ceil_mode=True, 12 | out_indices=(3, 4), 13 | out_feature_indices=(22, 34), 14 | l2_norm_scale=20), 15 | neck=None, 16 | bbox_head=dict( 17 | type='SSDHead', 18 | input_size=input_size, 19 | in_channels=(512, 1024, 512, 256, 256, 256), 20 | num_classes=21, 21 | anchor_strides=(8, 16, 32, 64, 100, 300), 22 | basesize_ratio_range=(0.2, 0.9), 23 | anchor_ratios=([2], [2, 3], [2, 3], [2, 3], [2], [2]), 24 | target_means=(.0, .0, .0, .0), 25 | target_stds=(0.1, 0.1, 0.2, 0.2))) 26 | cudnn_benchmark = True 27 | train_cfg = dict( 28 | assigner=dict( 29 | type='MaxIoUAssigner', 30 | pos_iou_thr=0.5, 31 | neg_iou_thr=0.5, 32 | min_pos_iou=0., 33 | ignore_iof_thr=-1, 
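For reference, a toy invocation of the soft-NMS routine defined in mmdet/ops/nms/cpu_soft_nms.pyx above (a sketch; it assumes the Cython extension has been built, e.g. by running compile.sh at the repo root, and that the module is importable under this path):

import numpy as np
from mmdet.ops.nms.cpu_soft_nms import cpu_soft_nms

dets = np.array([[10, 10, 50, 50, 0.9],
                 [12, 12, 52, 52, 0.8],       # overlaps the first box heavily
                 [100, 100, 150, 150, 0.7]], dtype=np.float32)
new_dets, keep = cpu_soft_nms(dets, iou_thr=0.5, method=2, sigma=0.5,
                              min_score=0.001)
# new_dets is (kept_n, 5) with decayed scores; keep holds the original indices.
# method=1 decays overlapping scores by (1 - IoU) when IoU > iou_thr, method=2
# always applies exp(-IoU^2 / sigma); any other value reduces to hard NMS.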
34 | gt_max_assign_all=False), 35 | smoothl1_beta=1., 36 | allowed_border=-1, 37 | pos_weight=-1, 38 | neg_pos_ratio=3, 39 | debug=False) 40 | test_cfg = dict( 41 | nms=dict(type='nms', iou_thr=0.45), 42 | min_bbox_size=0, 43 | score_thr=0.02, 44 | max_per_img=200) 45 | # model training and testing settings 46 | # dataset settings 47 | dataset_type = 'VOCDataset' 48 | data_root = 'data/VOCdevkit/' 49 | img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True) 50 | data = dict( 51 | imgs_per_gpu=4, 52 | workers_per_gpu=2, 53 | train=dict( 54 | type='RepeatDataset', 55 | times=10, 56 | dataset=dict( 57 | type=dataset_type, 58 | ann_file=[ 59 | data_root + 'VOC2007/ImageSets/Main/trainval.txt', 60 | data_root + 'VOC2012/ImageSets/Main/trainval.txt' 61 | ], 62 | img_prefix=[data_root + 'VOC2007/', data_root + 'VOC2012/'], 63 | img_scale=(300, 300), 64 | img_norm_cfg=img_norm_cfg, 65 | size_divisor=None, 66 | flip_ratio=0.5, 67 | with_mask=False, 68 | with_crowd=False, 69 | with_label=True, 70 | test_mode=False, 71 | extra_aug=dict( 72 | photo_metric_distortion=dict( 73 | brightness_delta=32, 74 | contrast_range=(0.5, 1.5), 75 | saturation_range=(0.5, 1.5), 76 | hue_delta=18), 77 | expand=dict( 78 | mean=img_norm_cfg['mean'], 79 | to_rgb=img_norm_cfg['to_rgb'], 80 | ratio_range=(1, 4)), 81 | random_crop=dict( 82 | min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), min_crop_size=0.3)), 83 | resize_keep_ratio=False)), 84 | val=dict( 85 | type=dataset_type, 86 | ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt', 87 | img_prefix=data_root + 'VOC2007/', 88 | img_scale=(300, 300), 89 | img_norm_cfg=img_norm_cfg, 90 | size_divisor=None, 91 | flip_ratio=0, 92 | with_mask=False, 93 | with_label=False, 94 | test_mode=True, 95 | resize_keep_ratio=False), 96 | test=dict( 97 | type=dataset_type, 98 | ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt', 99 | img_prefix=data_root + 'VOC2007/', 100 | img_scale=(300, 300), 101 | img_norm_cfg=img_norm_cfg, 102 | size_divisor=None, 103 | flip_ratio=0, 104 | with_mask=False, 105 | with_label=False, 106 | test_mode=True, 107 | resize_keep_ratio=False)) 108 | # optimizer 109 | optimizer = dict(type='SGD', lr=1e-3, momentum=0.9, weight_decay=5e-4) 110 | optimizer_config = dict() 111 | # learning policy 112 | lr_config = dict( 113 | policy='step', 114 | warmup='linear', 115 | warmup_iters=500, 116 | warmup_ratio=1.0 / 3, 117 | step=[16, 20]) 118 | checkpoint_config = dict(interval=1) 119 | # yapf:disable 120 | log_config = dict( 121 | interval=50, 122 | hooks=[ 123 | dict(type='TextLoggerHook'), 124 | # dict(type='TensorboardLoggerHook') 125 | ]) 126 | # yapf:enable 127 | # runtime settings 128 | total_epochs = 24 129 | dist_params = dict(backend='nccl') 130 | log_level = 'INFO' 131 | work_dir = './work_dirs/ssd300_voc' 132 | load_from = None 133 | resume_from = None 134 | workflow = [('train', 1)] 135 | --------------------------------------------------------------------------------
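All of the configs above share the same 'step' learning-rate policy with linear warmup. The following sketch shows the arithmetic using the ssd300_voc.py values (warmup_iters=500, warmup_ratio=1/3, step=[16, 20]); it is my own helper, not mmcv's hook implementation, and the decay factor gamma=0.1 is an assumed default:

def lr_at(cur_iter, epoch, base_lr, warmup_iters=500, warmup_ratio=1.0 / 3,
          step=(16, 20), gamma=0.1):
    # Linear warmup: ramp from base_lr * warmup_ratio up to base_lr over the
    # first warmup_iters iterations.
    if cur_iter < warmup_iters:
        k = (1 - cur_iter / warmup_iters) * (1 - warmup_ratio)
        return base_lr * (1 - k)
    # Step decay: multiply by gamma once for each milestone epoch passed.
    return base_lr * gamma ** sum(epoch >= s for s in step)

assert abs(lr_at(0, 0, 1e-3) - 1e-3 / 3) < 1e-12    # warmup starts at lr / 3
assert abs(lr_at(500, 0, 1e-3) - 1e-3) < 1e-12      # full lr once warmup ends
assert abs(lr_at(99999, 21, 1e-3) - 1e-5) < 1e-12   # decayed twice after epoch 20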