├── .flake8 ├── .gitignore ├── README.md ├── configs ├── Base_RetinaFace.yaml └── facetron │ ├── retinaface_r_50_3x.yaml │ └── retinaface_r_50_torchvision_3x.yaml ├── dev └── linter.sh ├── mmdet ├── __init__.py ├── config │ ├── __init__.py │ ├── config.py │ └── defaults.py ├── data │ ├── __init__.py │ ├── datasets │ │ ├── __init__.py │ │ └── widerface.py │ ├── transforms │ │ ├── __init__.py │ │ └── widerface_transform.py │ └── widerface_dataset_mapper.py ├── evaluation │ ├── __init__.py │ ├── box_overlaps.c │ ├── box_overlaps.pyx │ ├── evaluator.py │ └── widerface_evaluation.py ├── layers │ ├── DCNv2 │ │ ├── __init__.py │ │ ├── dcn_v2.py │ │ └── src │ │ │ ├── cpu │ │ │ ├── dcn_v2_cpu.cpp │ │ │ ├── dcn_v2_im2col_cpu.cpp │ │ │ ├── dcn_v2_im2col_cpu.h │ │ │ ├── dcn_v2_psroi_pooling_cpu.cpp │ │ │ └── vision.h │ │ │ ├── cuda │ │ │ ├── dcn_v2_cuda.cu │ │ │ ├── dcn_v2_im2col_cuda.cu │ │ │ ├── dcn_v2_im2col_cuda.h │ │ │ ├── dcn_v2_psroi_pooling_cuda.cu │ │ │ └── vision.h │ │ │ ├── dcn_v2.h │ │ │ └── vision.cpp │ ├── __init__.py │ ├── nms.py │ └── ssh.py └── modeling │ ├── __init__.py │ ├── backbone │ ├── __init__.py │ └── torch_resnet.py │ └── meta_arch │ ├── __init__.py │ └── retinaface.py ├── setup.py └── train_net.py /.flake8: -------------------------------------------------------------------------------- 1 | # This is an example .flake8 config, used when developing *Black* itself. 2 | # Keep in sync with setup.cfg which is used for source packages. 3 | 4 | [flake8] 5 | ignore = W503, E203, E221, C901 6 | max-line-length = 180 7 | max-complexity = 18 8 | select = B,C,E,F,W,T4,B9 9 | exclude = build,__init__.py -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # User Define 2 | .vscode 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | 109 | .DS_Store 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RetinaFace in PyTorch 2 | 3 | A [PyTorch](https://pytorch.org/) implementation of [RetinaFace: Single-stage Dense Face Localisation in the Wild](https://arxiv.org/abs/1905.00641). The official MXNet implementation can be found [here](https://github.com/deepinsight/insightface/tree/master/RetinaFace). 4 | 5 | The old version can be found at [v1.0](https://github.com/lbin/Retinaface_Mobilenet_Pytorch/tree/v1.0). 6 | 7 | 8 | ## WiderFace Val Performance (single scale, ResNet-50 backbone) 9 | 10 | | Style | Easy | Medium | Hard | 11 | | :-------------------- | :----: | :----: | :----: | 12 | | Ours (Original Scale) | 94.14% | 92.71% | 81.13% | 13 | 14 | ## Dependencies 15 | 16 | * pytorch >= 1.4.0 17 | * torchvision >= 0.4.0 18 | * python >= 3.6 19 | 20 | ## Installation 21 | 22 | pip install -e .
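For a quick end-to-end sanity check outside of `train_net.py`, the pieces above can be wired together roughly as follows. This is a minimal sketch, not the project's documented API: it assumes detectron2 is installed, the DCNv2 extension has been built by `pip install -e .`, and that importing `mmdet` registers the `RetinaFace` meta-architecture with detectron2.

```python
# Hypothetical usage sketch; the supported training entry point is train_net.py.
from detectron2.modeling import build_model

import mmdet  # noqa: F401  -- importing should register the RetinaFace meta-architecture
from mmdet.config import get_cfg
from mmdet.data.datasets.widerface import register_widerface

register_widerface()  # registers "widerface_train"/"widerface_val" (data expected under ./datasets/widerface)
cfg = get_cfg()       # detectron2 defaults plus the extra RETINANET.WITH_DCNv2 / NORM keys
cfg.merge_from_file("configs/facetron/retinaface_r_50_3x.yaml", allow_unsafe=True)
model = build_model(cfg)  # a torch.nn.Module; load weights separately, e.g. with DetectionCheckpointer
```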
23 | -------------------------------------------------------------------------------- /configs/Base_RetinaFace.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "RetinaFace" 3 | BACKBONE: 4 | NAME: "build_resnet_fpn_backbone" 5 | RESNETS: 6 | OUT_FEATURES: ["res3", "res4", "res5"] 7 | ANCHOR_GENERATOR: 8 | SIZES: !!python/object/apply:eval ["[[x, x * 2**(1.0/3), x * 2**(2.0/3) ] for x in [32, 64, 128, 256, 512 ]]"] 9 | FPN: 10 | IN_FEATURES: ["res3", "res4", "res5"] 11 | RETINANET: 12 | IOU_THRESHOLDS: [0.4, 0.5] 13 | IOU_LABELS: [0, -1, 1] 14 | DATASETS: 15 | TRAIN: ("widerface_train",) 16 | TEST: ("widerface_val",) 17 | SOLVER: 18 | IMS_PER_BATCH: 16 19 | BASE_LR: 0.01 # Note that RetinaNet uses a different default learning rate 20 | STEPS: (60000, 80000) 21 | MAX_ITER: 90000 22 | CHECKPOINT_PERIOD: 2000 23 | INPUT: 24 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 25 | DATALOADER: 26 | NUM_WORKERS: 16 27 | TEST: 28 | DETECTIONS_PER_IMAGE: 300 29 | EVAL_PERIOD: 10000 30 | VERSION: 2 31 | OUTPUT_DIR: "/mnt/tensorboard/" 32 | -------------------------------------------------------------------------------- /configs/facetron/retinaface_r_50_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base_RetinaFace.yaml" 2 | DATASETS: 3 | TRAIN: ("widerface_train",) 4 | TEST: ("widerface_val",) 5 | MODEL: 6 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 7 | BACKBONE: 8 | NAME: "build_resnet_fpn_backbone" 9 | FREEZE_AT: 0 10 | RESNETS: 11 | DEPTH: 50 12 | STRIDE_IN_1X1: False 13 | NORM: 'SyncBN' 14 | FPN: 15 | NORM: 'SyncBN' 16 | RETINANET: 17 | NUM_CLASSES: 1 18 | IN_FEATURES: ['p3', 'p4', 'p5'] 19 | SCORE_THRESH_TEST: 0.02 20 | TOPK_CANDIDATES_TEST: 5000 21 | NMS_THRESH_TEST: 0.4 22 | WITH_DCNv2: True 23 | NORM: 'SyncBN' 24 | SMOOTH_L1_LOSS_BETA: 0.0 25 | ANCHOR_GENERATOR: 26 | SIZES: [[16, 32], [64, 128], [256, 512]] 27 | ASPECT_RATIOS: [[1.0]] 28 | SOLVER: 29 | IMS_PER_BATCH: 32 30 | BASE_LR: 0.02 # Note that RetinaNet uses a different default learning rate 31 | STEPS: (210000, 250000) 32 | MAX_ITER: 270000 33 | CHECKPOINT_PERIOD: 10000 34 | INPUT: 35 | MIN_SIZE_TRAIN: (540, 640, 672, 704, 736, 768, 800) 36 | MAX_SIZE_TRAIN: 1920 37 | MIN_SIZE_TEST: 0 38 | MAX_SIZE_TEST: 0 39 | CROP: 40 | ENABLED: True 41 | -------------------------------------------------------------------------------- /configs/facetron/retinaface_r_50_torchvision_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base_RetinaFace.yaml" 2 | DATASETS: 3 | TRAIN: ("widerface_train",) 4 | TEST: ("widerface_val",) 5 | MODEL: 6 | WEIGHTS: "" 7 | PIXEL_MEAN: [0.485, 0.456, 0.406] 8 | PIXEL_STD: [0.229, 0.224, 0.225] 9 | BACKBONE: 10 | NAME: "build_torch_resnet_fpn_backbone" 11 | FREEZE_AT: 0 12 | RESNETS: 13 | DEPTH: 50 14 | STRIDE_IN_1X1: False 15 | NORM: 'SyncBN' 16 | FPN: 17 | NORM: 'SyncBN' 18 | RETINANET: 19 | NUM_CLASSES: 1 20 | IN_FEATURES: ['p3', 'p4', 'p5'] 21 | SCORE_THRESH_TEST: 0.02 22 | TOPK_CANDIDATES_TEST: 5000 23 | NMS_THRESH_TEST: 0.4 24 | WITH_DCNv2: True 25 | NORM: 'SyncBN' 26 | SMOOTH_L1_LOSS_BETA: 0.0 27 | ANCHOR_GENERATOR: 28 | SIZES: [[16, 32], [64, 128], [256, 512]] 29 | ASPECT_RATIOS: [[1.0]] 30 | SOLVER: 31 | IMS_PER_BATCH: 32 32 | BASE_LR: 0.02 # Note that RetinaNet uses a different default learning rate 33 | STEPS: (210000, 250000) 34 | MAX_ITER: 270000 35 | CHECKPOINT_PERIOD: 10000 36 | INPUT: 37 | MIN_SIZE_TRAIN: 
(540, 640, 672, 704, 736, 768, 800) 38 | MAX_SIZE_TRAIN: 1920 39 | MIN_SIZE_TEST: 0 40 | MAX_SIZE_TEST: 0 41 | CROP: 42 | ENABLED: True 43 | FORMAT: "RGB" -------------------------------------------------------------------------------- /dev/linter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | 4 | # Run this script at project root by "./dev/linter.sh" before you commit 5 | 6 | vergte() { 7 | [ "$2" = "$(echo -e "$1\\n$2" | sort -V | head -n1)" ] 8 | } 9 | 10 | { 11 | black --version | grep "19.3b0" > /dev/null 12 | } || { 13 | echo "Linter requires black==19.3b0 !" 14 | exit 1 15 | } 16 | 17 | ISORT_TARGET_VERSION="4.3.21" 18 | ISORT_VERSION=$(isort -v | grep VERSION | awk '{print $2}') 19 | vergte "$ISORT_VERSION" "$ISORT_TARGET_VERSION" || { 20 | echo "Linter requires isort>=${ISORT_TARGET_VERSION} !" 21 | exit 1 22 | } 23 | 24 | set -v 25 | 26 | echo "Running isort ..." 27 | isort -y -sp . --atomic 28 | 29 | echo "Running black ..." 30 | black -l 100 . 31 | 32 | echo "Running flake8 ..." 33 | if [ -x "$(command -v flake8-3)" ]; then 34 | flake8-3 . 35 | else 36 | python3 -m flake8 . 37 | fi 38 | 39 | # echo "Running mypy ..." 40 | # Pytorch does not have enough type annotations 41 | # mypy mmdet/solver mmdet/structures mmdet/config 42 | 43 | # echo "Running clang-format ..." 44 | # find . -regex ".*\.\(cpp\|c\|cc\|cu\|cxx\|h\|hh\|hpp\|hxx\|tcc\|mm\|m\)" -print0 | xargs -0 clang-format -i 45 | 46 | # command -v arc > /dev/null && arc lint 47 | -------------------------------------------------------------------------------- /mmdet/__init__.py: -------------------------------------------------------------------------------- 1 | from .modeling import RetinaFace, build_torch_resnet_fpn_backbone 2 | 3 | __version__ = "0.1.0" 4 | -------------------------------------------------------------------------------- /mmdet/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import get_cfg 2 | 3 | __all__ = ["get_cfg"] 4 | -------------------------------------------------------------------------------- /mmdet/config/config.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import CfgNode 2 | 3 | 4 | def get_cfg() -> CfgNode: 5 | """ 6 | Get a copy of the default config. 7 | Returns: 8 | a detectron2 CfgNode instance. 9 | """ 10 | from .defaults import _C 11 | 12 | return _C.clone() 13 | -------------------------------------------------------------------------------- /mmdet/config/defaults.py: -------------------------------------------------------------------------------- 1 | # from detectron2.config import CfgNode as CN 2 | from detectron2.config.defaults import _C 3 | 4 | # ---------------------------------------------------------------------------- # 5 | # Additional Configs 6 | # ---------------------------------------------------------------------------- # 7 | 8 | _C.MODEL.RETINANET.WITH_DCNv2 = False 9 | _C.MODEL.RETINANET.NORM = "BN" 10 | -------------------------------------------------------------------------------- /mmdet/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import datasets # just to register data 2 | -------------------------------------------------------------------------------- /mmdet/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from . import widerface 2 | -------------------------------------------------------------------------------- /mmdet/data/datasets/widerface.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import cv2 4 | import numpy as np 5 | from detectron2.data import DatasetCatalog, MetadataCatalog 6 | from detectron2.structures.boxes import BoxMode 7 | 8 | 9 | def get_widerface_metadata(): 10 | metadata = {"thing_classes": ["face"]} 11 | return metadata 12 | 13 | 14 | def get_widerface_dicts(image_root): 15 | label_file = os.path.join(image_root, "label.txt") 16 | 17 | imgs_path = [] 18 | imgs_path_no_head = [] 19 | words = [] 20 | 21 | with open(label_file) as f: 22 | lines = f.readlines() 23 | isFirst = True 24 | labels = [] 25 | for line in lines: 26 | line = line.rstrip() 27 | if line.startswith("#"): 28 | if isFirst is True: 29 | isFirst = False 30 | else: 31 | labels_copy = labels.copy() 32 | words.append(labels_copy) 33 | labels.clear() 34 | path = line[2:] 35 | imgs_path_no_head.append(path) 36 | path = label_file.replace("label.txt", "images/") + path 37 | imgs_path.append(path) 38 | else: 39 | line = line.split(" ") 40 | label = [float(x) for x in line] 41 | labels.append(label) 42 | 43 | words.append(labels) 44 | 45 | widerface_dicts = [] 46 | for index in range(len(words)): 47 | 48 | filename = imgs_path[index] 49 | height, width = cv2.imread(filename).shape[:2] 50 | 51 | record = {} 52 | record["file_name"] = filename 53 | record["image_id"] = imgs_path_no_head[index] 54 | record["height"] = height 55 | record["width"] = width 56 | 57 | labels = words[index] 58 | # annotations = np.zeros((0, 15)) 59 | objs = [] 60 | 61 | for idx, label in enumerate(labels): 62 | annotation = np.zeros((1, 15)) 63 | # bbox 64 | annotation[0, 0] = label[0] # x1 65 | annotation[0, 1] = label[1] # y1 66 | 67 | if label[0] >= width or label[1] >= height: 68 | continue 69 | 70 | if label[2] <= 0 or label[3] <= 0: 71 | continue 72 | 73 | annotation[0, 2] = label[0] + label[2] # x2 74 | if annotation[0, 2] >= width: 75 | annotation[0, 2] = width - 1 76 | 77 | annotation[0, 3] = label[1] + label[3] # y2 78 | if annotation[0, 3] >= height: 79 | annotation[0, 3] = height - 1 80 | 81 | if len(label) > 4: 82 | # landmarks 83 | annotation[0, 4] = label[4] # l0_x 84 | annotation[0, 5] = label[5] # l0_y 85 | annotation[0, 6] = label[7] # l1_x 86 | annotation[0, 7] = label[8] # l1_y 87 | annotation[0, 8] = label[10] # l2_x 88 | annotation[0, 9] = label[11] # l2_y 89 | annotation[0, 10] = label[13] # l3_x 90 | annotation[0, 11] = label[14] # l3_y 91 | annotation[0, 12] = label[16] # l4_x 92 | annotation[0, 13] = label[17] # l4_y 93 | if annotation[0, 4] < 0: 94 | annotation[0, 14] = -1 95 | else: 96 | annotation[0, 14] = 1 97 | obj = { 98 | "bbox": [annotation[0, 0], annotation[0, 1], annotation[0, 2], annotation[0, 3]], 99 | "bbox_mode": BoxMode.XYXY_ABS, 100 | "landmark": annotation, 101 | "category_id": 0, 102 | } 103 | objs.append(obj) 104 | 105 | record["annotations"] = objs 106 | widerface_dicts.append(record) 107 | return widerface_dicts 108 | 109 | 110 | def register_widerface(): 111 | SPLITS = { 112 | "widerface_train": ("widerface/train", "widerface/train/label.txt"), 113 | "widerface_val": ("widerface/val", 
"widerface/val/label.txt"), 114 | } 115 | for name, (image_root, label_file) in SPLITS.items(): 116 | label_file = os.path.join("datasets", label_file) 117 | image_root = os.path.join("datasets", image_root) 118 | register_widerface_instance(name, image_root) 119 | 120 | 121 | def register_widerface_instance(name, image_root): 122 | DatasetCatalog.register(name, lambda name=name: get_widerface_dicts(image_root)) 123 | MetadataCatalog.get(name).set(**get_widerface_metadata()) 124 | -------------------------------------------------------------------------------- /mmdet/data/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | from detectron2.data.transforms import * 2 | from fvcore.transforms import * 3 | 4 | from .widerface_transform import * 5 | 6 | __all__ = [k for k in globals().keys() if not k.startswith("_")] 7 | -------------------------------------------------------------------------------- /mmdet/data/transforms/widerface_transform.py: -------------------------------------------------------------------------------- 1 | import random 2 | import sys 3 | 4 | import numpy as np 5 | from detectron2.data.transforms import ResizeTransform, TransformGen 6 | from fvcore.transforms.transform import CropTransform, NoOpTransform 7 | from PIL import Image 8 | 9 | __all__ = ["WiderFace_ResizeShortestEdge", "WiderFace_NoOpTransform", "WiderFace_RandomCrop"] 10 | 11 | 12 | class WiderFace_ResizeShortestEdge(TransformGen): 13 | """ 14 | Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge. 15 | If `max_size` is reached, then downscale so that the longer edge does not exceed max_size. 16 | """ 17 | 18 | def __init__( 19 | self, short_edge_length, max_size=sys.maxsize, sample_style="choice", interp=Image.BILINEAR 20 | ): 21 | """ 22 | Args: 23 | short_edge_length (list[int]): If ``sample_style=="range"``, 24 | a [min, max] interval from which to sample the shortest edge length. 25 | If ``sample_style=="choice"``, a list of shortest edge lengths to sample from. 26 | max_size (int): maximum allowed longest edge length. 27 | sample_style (str): either "range" or "choice". 28 | """ 29 | super().__init__() 30 | assert sample_style in ["range", "choice"], sample_style 31 | 32 | self.is_range = sample_style == "range" 33 | if isinstance(short_edge_length, int): 34 | short_edge_length = (short_edge_length, short_edge_length) 35 | self._init(locals()) 36 | 37 | def get_transform(self, img): 38 | h, w = img.shape[:2] 39 | if min(h, w) >= self.short_edge_length[0]: 40 | return NoOpTransform() 41 | 42 | scale = self.short_edge_length[0] * 1.0 / min(h, w) 43 | newh = h * scale 44 | neww = w * scale 45 | neww = int(neww + 0.5) 46 | newh = int(newh + 0.5) 47 | return ResizeTransform(h, w, newh, neww, self.interp) 48 | 49 | 50 | class WiderFace_NoOpTransform(TransformGen): 51 | """ 52 | Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge. 53 | If `max_size` is reached, then downscale so that the longer edge does not exceed max_size. 54 | """ 55 | 56 | def __init__(self): 57 | """ 58 | Args: 59 | short_edge_length (list[int]): If ``sample_style=="range"``, 60 | a [min, max] interval from which to sample the shortest edge length. 61 | If ``sample_style=="choice"``, a list of shortest edge lengths to sample from. 62 | max_size (int): maximum allowed longest edge length. 63 | sample_style (str): either "range" or "choice". 
64 | """ 65 | super().__init__() 66 | 67 | def get_transform(self, img): 68 | 69 | return NoOpTransform() 70 | 71 | 72 | class WiderFace_RandomCrop(TransformGen): 73 | """ 74 | Randomly crop a subimage out of an image. 75 | """ 76 | 77 | def __init__(self): 78 | super().__init__() 79 | self._init(locals()) 80 | 81 | def get_transform(self, img): 82 | h, w = img.shape[:2] 83 | croph, cropw = self.get_crop_size((h, w)) 84 | assert h >= croph and w >= cropw, "Shape computation in {} has bugs.".format(self) 85 | h0 = np.random.randint(h - croph + 1) 86 | w0 = np.random.randint(w - cropw + 1) 87 | return CropTransform(w0, h0, cropw, croph) 88 | 89 | def get_crop_size(self, image_size): 90 | """ 91 | Args: 92 | image_size (tuple): height, width 93 | Returns: 94 | crop_size (tuple): height, width in absolute pixels 95 | """ 96 | h, w = image_size 97 | 98 | # crop_size = np.asarray([0.1, 0.9], dtype=np.float32) 99 | # ch, cw = crop_size + np.random.rand(2) * (1 - crop_size) 100 | # return int(h * ch + 0.5), int(w * cw + 0.5) 101 | 102 | PRE_SCALES = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] 103 | scale = random.choice(PRE_SCALES) 104 | short_side = min(h, w) 105 | w = int(scale * short_side) 106 | h = w 107 | return h, w 108 | -------------------------------------------------------------------------------- /mmdet/data/widerface_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | import copy 3 | import logging 4 | 5 | import numpy as np 6 | import torch 7 | from detectron2.data import detection_utils as utils 8 | from fvcore.common.file_io import PathManager 9 | from PIL import Image 10 | 11 | from . import transforms as T 12 | 13 | """ 14 | This file contains the default mapping that's applied to "dataset dicts". 15 | """ 16 | 17 | __all__ = ["WiderFace_DatasetMapper"] 18 | 19 | 20 | class WiderFace_DatasetMapper: 21 | """ 22 | A callable which takes a dataset dict in Detectron2 Dataset format, 23 | and map it into a format used by the model. 24 | 25 | This is the default callable to be used to map your dataset dict into training data. 26 | You may need to follow it to implement your own one for customized logic, 27 | such as a different way to read or transform images. 28 | See :doc:`/tutorials/data_loading` for details. 29 | 30 | The callable currently does the following: 31 | 32 | 1. Read the image from "file_name" 33 | 2. Applies cropping/geometric transforms to the image and annotations 34 | 3. 
Prepare data and annotations to Tensor and :class:`Instances` 35 | """ 36 | 37 | def __init__(self, cfg, is_train=True): 38 | if cfg.INPUT.CROP.ENABLED and is_train: 39 | # self.crop_gen = T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE) 40 | self.crop_gen = T.WiderFace_RandomCrop() 41 | logging.getLogger(__name__).info("CropGen used in training: " + str(self.crop_gen)) 42 | else: 43 | self.crop_gen = None 44 | 45 | if is_train: 46 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 47 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 48 | sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING 49 | 50 | self.tfm_gens = [] 51 | # self.tfm_gens.append(T.WiderFace_ResizeShortestEdge(min_size, max_size, sample_style)) 52 | self.tfm_gens.append(T.ResizeShortestEdge(min_size, max_size, sample_style)) 53 | self.tfm_gens.append(T.RandomFlip(prob=0.5, horizontal=True, vertical=False)) 54 | self.tfm_gens.append(T.RandomFlip(prob=0.5, horizontal=False, vertical=True)) 55 | # self.tfm_gens.append(T.RandomContrast(0.7, 3.2)) 56 | # self.tfm_gens.append(T.RandomBrightness(0.6, 1.8)) 57 | # self.tfm_gens.append(T.RandomSaturation(0.6, 1.4)) 58 | # self.tfm_gens.append(T.RandomLighting(0.1)) 59 | logging.getLogger(__name__).info( 60 | "TransformGens used in training: " + str(self.tfm_gens) 61 | ) 62 | else: 63 | self.tfm_gens = [] 64 | self.tfm_gens.append(T.WiderFace_NoOpTransform()) 65 | 66 | # fmt: off 67 | self.img_format = cfg.INPUT.FORMAT 68 | self.mask_on = cfg.MODEL.MASK_ON 69 | self.mask_format = cfg.INPUT.MASK_FORMAT 70 | self.keypoint_on = cfg.MODEL.KEYPOINT_ON 71 | self.load_proposals = cfg.MODEL.LOAD_PROPOSALS 72 | # fmt: on 73 | if self.keypoint_on and is_train: 74 | # Flip only makes sense in training 75 | self.keypoint_hflip_indices = utils.create_keypoint_hflip_indices(cfg.DATASETS.TRAIN) 76 | else: 77 | self.keypoint_hflip_indices = None 78 | 79 | if self.load_proposals: 80 | self.min_box_side_len = cfg.MODEL.PROPOSAL_GENERATOR.MIN_SIZE 81 | self.proposal_topk = ( 82 | cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN 83 | if is_train 84 | else cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST 85 | ) 86 | self.is_train = is_train 87 | 88 | def __call__(self, dataset_dict): 89 | """ 90 | Args: 91 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 92 | 93 | Returns: 94 | dict: a format that builtin models in detectron2 accept 95 | """ 96 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 97 | # USER: Write your own image loading if it's not from a file 98 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 99 | utils.check_image_size(dataset_dict, image) 100 | 101 | if "annotations" not in dataset_dict: 102 | image, transforms = T.apply_transform_gens( 103 | ([self.crop_gen] if self.crop_gen else []) + self.tfm_gens, image 104 | ) 105 | else: 106 | # Crop around an instance if there are instances in the image. 
107 | # USER: Remove if you don't use cropping 108 | # image, transforms = T.apply_transform_gens(self.tfm_gens, image) 109 | if self.crop_gen: 110 | crop_tfm = utils.gen_crop_transform_with_instance( 111 | self.crop_gen.get_crop_size(image.shape[:2]), 112 | image.shape[:2], 113 | np.random.choice(dataset_dict["annotations"]), 114 | ) 115 | image = crop_tfm.apply_image(image) 116 | 117 | image, transforms = T.apply_transform_gens(self.tfm_gens, image) 118 | if self.crop_gen: 119 | transforms = crop_tfm + transforms 120 | 121 | image_shape = image.shape[:2] # h, w 122 | 123 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 124 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 125 | # Therefore it's important to use torch.Tensor. 126 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 127 | 128 | # USER: Remove if you don't use pre-computed proposals. 129 | if self.load_proposals: 130 | utils.transform_proposals( 131 | dataset_dict, image_shape, transforms, self.min_box_side_len, self.proposal_topk 132 | ) 133 | 134 | if not self.is_train: 135 | # USER: Modify this if you want to keep them for some reason. 136 | dataset_dict.pop("annotations", None) 137 | dataset_dict.pop("sem_seg_file_name", None) 138 | return dataset_dict 139 | 140 | if "annotations" in dataset_dict: 141 | # USER: Modify this if you want to keep them for some reason. 142 | for anno in dataset_dict["annotations"]: 143 | if not self.mask_on: 144 | anno.pop("segmentation", None) 145 | if not self.keypoint_on: 146 | anno.pop("keypoints", None) 147 | 148 | # USER: Implement additional transformations if you have other types of data 149 | annos = [ 150 | utils.transform_instance_annotations( 151 | obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices 152 | ) 153 | for obj in dataset_dict.pop("annotations") 154 | if obj.get("iscrowd", 0) == 0 155 | ] 156 | instances = utils.annotations_to_instances( 157 | annos, image_shape, mask_format=self.mask_format 158 | ) 159 | # Create a tight bounding box from masks, useful when image is cropped 160 | if self.crop_gen and instances.has("gt_masks"): 161 | instances.gt_boxes = instances.gt_masks.get_bounding_boxes() 162 | dataset_dict["instances"] = utils.filter_empty_instances(instances) 163 | 164 | # USER: Remove if you don't do semantic/panoptic segmentation. 
165 | if "sem_seg_file_name" in dataset_dict: 166 | with PathManager.open(dataset_dict.pop("sem_seg_file_name"), "rb") as f: 167 | sem_seg_gt = Image.open(f) 168 | sem_seg_gt = np.asarray(sem_seg_gt, dtype="uint8") 169 | sem_seg_gt = transforms.apply_segmentation(sem_seg_gt) 170 | sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) 171 | dataset_dict["sem_seg"] = sem_seg_gt 172 | return dataset_dict 173 | -------------------------------------------------------------------------------- /mmdet/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lbin/Retinaface_Detectron2/579e500b35efac6afc389dfc9bbea0b129e91ba6/mmdet/evaluation/__init__.py -------------------------------------------------------------------------------- /mmdet/evaluation/box_overlaps.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Sergey Karayev 6 | # -------------------------------------------------------- 7 | 8 | cimport cython 9 | import numpy as np 10 | cimport numpy as np 11 | 12 | DTYPE = np.float 13 | ctypedef np.float_t DTYPE_t 14 | 15 | def bbox_overlaps( 16 | np.ndarray[DTYPE_t, ndim=2] boxes, 17 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 18 | """ 19 | Parameters 20 | ---------- 21 | boxes: (N, 4) ndarray of float 22 | query_boxes: (K, 4) ndarray of float 23 | Returns 24 | ------- 25 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 26 | """ 27 | cdef unsigned int N = boxes.shape[0] 28 | cdef unsigned int K = query_boxes.shape[0] 29 | cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) 30 | cdef DTYPE_t iw, ih, box_area 31 | cdef DTYPE_t ua 32 | cdef unsigned int k, n 33 | for k in range(K): 34 | box_area = ( 35 | (query_boxes[k, 2] - query_boxes[k, 0] + 1) * 36 | (query_boxes[k, 3] - query_boxes[k, 1] + 1) 37 | ) 38 | for n in range(N): 39 | iw = ( 40 | min(boxes[n, 2], query_boxes[k, 2]) - 41 | max(boxes[n, 0], query_boxes[k, 0]) + 1 42 | ) 43 | if iw > 0: 44 | ih = ( 45 | min(boxes[n, 3], query_boxes[k, 3]) - 46 | max(boxes[n, 1], query_boxes[k, 1]) + 1 47 | ) 48 | if ih > 0: 49 | ua = float( 50 | (boxes[n, 2] - boxes[n, 0] + 1) * 51 | (boxes[n, 3] - boxes[n, 1] + 1) + 52 | box_area - iw * ih 53 | ) 54 | overlaps[n, k] = iw * ih / ua 55 | return overlaps -------------------------------------------------------------------------------- /mmdet/evaluation/evaluator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from collections import OrderedDict, defaultdict 4 | 5 | import torch 6 | from detectron2.data import MetadataCatalog 7 | from detectron2.evaluation.evaluator import DatasetEvaluator 8 | from detectron2.utils import comm 9 | 10 | from .widerface_evaluation import evaluation 11 | 12 | 13 | class WiderFaceEvaluator(DatasetEvaluator): 14 | """ 15 | Evaluate Wider Face AP. 16 | It contains a synchronization, therefore has to be called from all ranks. 
17 | """ 18 | 19 | def __init__(self, dataset_name, output_folder): 20 | """ 21 | Args: 22 | dataset_name (str): name of the dataset, e.g., "widerface_val" 23 | """ 24 | self._dataset_name = dataset_name 25 | self._output_folder = output_folder 26 | meta = MetadataCatalog.get(dataset_name) 27 | # data_info = DatasetCatalog.get(dataset_name) 28 | 29 | self._class_names = meta.thing_classes 30 | 31 | self._cpu_device = torch.device("cpu") 32 | self._logger = logging.getLogger(__name__) 33 | 34 | def reset(self): 35 | self._predictions = defaultdict(list) # class name -> list of prediction strings 36 | 37 | def process(self, inputs, outputs): 38 | for input, output in zip(inputs, outputs): 39 | image_id = input["image_id"] 40 | instances = output["instances"].to(self._cpu_device) 41 | boxes = instances.pred_boxes.tensor.numpy() 42 | scores = instances.scores.tolist() 43 | classes = instances.pred_classes.tolist() 44 | for box, score, cls in zip(boxes, scores, classes): 45 | xmin, ymin, xmax, ymax = box 46 | # The inverse of data loading logic in `datasets/pascal_voc.py` 47 | xmin += 1 48 | ymin += 1 49 | 50 | self._predictions[image_id].append([xmin, ymin, xmax, ymax, score]) 51 | 52 | if len(self._predictions[image_id]) == 0: 53 | self._predictions[image_id].append([0, 0, 0, 0, 0]) 54 | 55 | def evaluate(self): 56 | 57 | all_predictions = comm.gather(self._predictions, dst=0) 58 | if not comm.is_main_process(): 59 | return 60 | predictions = defaultdict(list) 61 | for predictions_per_rank in all_predictions: 62 | for clsid, lines in predictions_per_rank.items(): 63 | predictions[clsid].extend(lines) 64 | del all_predictions 65 | 66 | tmp_results_path = os.path.join(self._output_folder, "wider_face_val_results") 67 | 68 | for image_id in predictions.keys(): 69 | tmp_results_file = tmp_results_path + "/" + image_id[:-4] + ".txt" 70 | dirname = os.path.dirname(tmp_results_file) 71 | if not os.path.isdir(dirname): 72 | os.makedirs(dirname) 73 | 74 | with open(tmp_results_file, "w") as fd: 75 | # bboxs = dets 76 | file_name = os.path.basename(tmp_results_file)[:-4] + "\n" 77 | bboxs_num = str(len(predictions[image_id])) + "\n" 78 | fd.write(file_name) 79 | fd.write(bboxs_num) 80 | idx = 0 81 | for box in predictions[image_id]: 82 | 83 | x = int(box[0]) 84 | y = int(box[1]) 85 | w = int(box[2]) - int(box[0]) 86 | h = int(box[3]) - int(box[1]) 87 | confidence = str(float(box[4])) 88 | line = ( 89 | str(x) 90 | + " " 91 | + str(y) 92 | + " " 93 | + str(w) 94 | + " " 95 | + str(h) 96 | + " " 97 | + confidence 98 | + " \n" 99 | ) 100 | fd.write(line) 101 | idx = idx + 1 102 | 103 | aps = evaluation(tmp_results_path, "datasets/widerface/val/ground_truth") 104 | 105 | ret = OrderedDict() 106 | ret["bbox"] = {"Easy": aps[0], "Medium": aps[1], "Hard": aps[2]} 107 | return ret 108 | -------------------------------------------------------------------------------- /mmdet/evaluation/widerface_evaluation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | import sys 5 | 6 | import numpy as np 7 | from scipy.io import loadmat 8 | 9 | from .bbox import bbox_overlaps 10 | 11 | sys.path.append(os.getcwd()) 12 | 13 | 14 | def get_gt_boxes(gt_dir): 15 | """ gt dir: (wider_face_val.mat, wider_easy_val.mat, wider_medium_val.mat, wider_hard_val.mat)""" 16 | 17 | gt_mat = loadmat(os.path.join(gt_dir, "wider_face_val.mat")) 18 | hard_mat = loadmat(os.path.join(gt_dir, "wider_hard_val.mat")) 19 | medium_mat = 
loadmat(os.path.join(gt_dir, "wider_medium_val.mat")) 20 | easy_mat = loadmat(os.path.join(gt_dir, "wider_easy_val.mat")) 21 | 22 | facebox_list = gt_mat["face_bbx_list"] 23 | event_list = gt_mat["event_list"] 24 | file_list = gt_mat["file_list"] 25 | 26 | hard_gt_list = hard_mat["gt_list"] 27 | medium_gt_list = medium_mat["gt_list"] 28 | easy_gt_list = easy_mat["gt_list"] 29 | 30 | return facebox_list, event_list, file_list, hard_gt_list, medium_gt_list, easy_gt_list 31 | 32 | 33 | def get_gt_boxes_from_txt(gt_path, cache_dir): 34 | f = open(gt_path, "r") 35 | state = 0 36 | lines = f.readlines() 37 | lines = list(map(lambda x: x.rstrip("\r\n"), lines)) 38 | boxes = {} 39 | # print(len(lines)) 40 | f.close() 41 | current_boxes = [] 42 | current_name = None 43 | for line in lines: 44 | if state == 0 and "--" in line: 45 | state = 1 46 | current_name = line 47 | continue 48 | if state == 1: 49 | state = 2 50 | continue 51 | 52 | if state == 2 and "--" in line: 53 | state = 1 54 | boxes[current_name] = np.array(current_boxes).astype("float32") 55 | current_name = line 56 | current_boxes = [] 57 | continue 58 | 59 | if state == 2: 60 | box = [float(x) for x in line.split(" ")[:4]] 61 | current_boxes.append(box) 62 | continue 63 | 64 | return boxes 65 | 66 | 67 | def read_pred_file(filepath): 68 | 69 | with open(filepath, "r") as f: 70 | lines = f.readlines() 71 | img_file = lines[0].rstrip("\n\r") 72 | lines = lines[2:] 73 | 74 | # b = lines[0].rstrip('\r\n').split(' ')[:-1] 75 | # c = float(b) 76 | # a = map(lambda x: [[float(a[0]), float(a[1]), float(a[2]), float(a[3]), float(a[4])] for a in x.rstrip('\r\n').split(' ')], lines) 77 | boxes = [] 78 | for line in lines: 79 | line = line.rstrip("\r\n").split(" ") 80 | if line[0] == "": 81 | continue 82 | # a = float(line[4]) 83 | boxes.append( 84 | [float(line[0]), float(line[1]), float(line[2]), float(line[3]), float(line[4])] 85 | ) 86 | boxes = np.array(boxes) 87 | # boxes = np.array(list(map(lambda x: [float(a) for a in x.rstrip('\r\n').split(' ')], lines))).astype('float') 88 | return img_file.split("/")[-1], boxes 89 | 90 | 91 | def get_preds(pred_dir): 92 | events = os.listdir(pred_dir) 93 | boxes = dict() 94 | 95 | for event in events: 96 | event_dir = os.path.join(pred_dir, event) 97 | event_images = os.listdir(event_dir) 98 | current_event = dict() 99 | for imgtxt in event_images: 100 | imgname, _boxes = read_pred_file(os.path.join(event_dir, imgtxt)) 101 | current_event[imgname.rstrip(".jpg")] = _boxes 102 | boxes[event] = current_event 103 | return boxes 104 | 105 | 106 | def norm_score(pred): 107 | """ norm score 108 | pred {key: [[x1,y1,x2,y2,s]]} 109 | """ 110 | 111 | max_score = 0 112 | min_score = 1 113 | 114 | for _, k in pred.items(): 115 | for _, v in k.items(): 116 | if len(v) == 0: 117 | continue 118 | _min = np.min(v[:, -1]) 119 | _max = np.max(v[:, -1]) 120 | max_score = max(_max, max_score) 121 | min_score = min(_min, min_score) 122 | 123 | diff = max_score - min_score 124 | for _, k in pred.items(): 125 | for _, v in k.items(): 126 | if len(v) == 0: 127 | continue 128 | v[:, -1] = (v[:, -1] - min_score) / diff 129 | 130 | 131 | def image_eval(pred, gt, ignore, iou_thresh): 132 | """ single image evaluation 133 | pred: Nx5 134 | gt: Nx4 135 | ignore: 136 | """ 137 | 138 | _pred = pred.copy() 139 | _gt = gt.copy() 140 | pred_recall = np.zeros(_pred.shape[0]) 141 | recall_list = np.zeros(_gt.shape[0]) 142 | proposal_list = np.ones(_pred.shape[0]) 143 | 144 | _pred[:, 2] = _pred[:, 2] + _pred[:, 0] 145 | _pred[:, 3] = 
_pred[:, 3] + _pred[:, 1] 146 | _gt[:, 2] = _gt[:, 2] + _gt[:, 0] 147 | _gt[:, 3] = _gt[:, 3] + _gt[:, 1] 148 | 149 | overlaps = bbox_overlaps(_pred[:, :4], _gt) 150 | 151 | for h in range(_pred.shape[0]): 152 | 153 | gt_overlap = overlaps[h] 154 | max_overlap, max_idx = gt_overlap.max(), gt_overlap.argmax() 155 | if max_overlap >= iou_thresh: 156 | if ignore[max_idx] == 0: 157 | recall_list[max_idx] = -1 158 | proposal_list[h] = -1 159 | elif recall_list[max_idx] == 0: 160 | recall_list[max_idx] = 1 161 | 162 | r_keep_index = np.where(recall_list == 1)[0] 163 | pred_recall[h] = len(r_keep_index) 164 | return pred_recall, proposal_list 165 | 166 | 167 | def img_pr_info(thresh_num, pred_info, proposal_list, pred_recall): 168 | pr_info = np.zeros((thresh_num, 2)).astype("float") 169 | for t in range(thresh_num): 170 | 171 | thresh = 1 - (t + 1) / thresh_num 172 | r_index = np.where(pred_info[:, 4] >= thresh)[0] 173 | if len(r_index) == 0: 174 | pr_info[t, 0] = 0 175 | pr_info[t, 1] = 0 176 | else: 177 | r_index = r_index[-1] 178 | p_index = np.where(proposal_list[: r_index + 1] == 1)[0] 179 | pr_info[t, 0] = len(p_index) 180 | pr_info[t, 1] = pred_recall[r_index] 181 | return pr_info 182 | 183 | 184 | def dataset_pr_info(thresh_num, pr_curve, count_face): 185 | _pr_curve = np.zeros((thresh_num, 2)) 186 | for i in range(thresh_num): 187 | _pr_curve[i, 0] = pr_curve[i, 1] / pr_curve[i, 0] 188 | _pr_curve[i, 1] = pr_curve[i, 1] / count_face 189 | return _pr_curve 190 | 191 | 192 | def voc_ap(rec, prec): 193 | 194 | # correct AP calculation 195 | # first append sentinel values at the end 196 | mrec = np.concatenate(([0.0], rec, [1.0])) 197 | mpre = np.concatenate(([0.0], prec, [0.0])) 198 | 199 | # compute the precision envelope 200 | for i in range(mpre.size - 1, 0, -1): 201 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 202 | 203 | # to calculate area under PR curve, look for points 204 | # where X axis (recall) changes value 205 | i = np.where(mrec[1:] != mrec[:-1])[0] 206 | 207 | # and sum (\Delta recall) * prec 208 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 209 | return ap 210 | 211 | 212 | def evaluation(pred, gt_path, iou_thresh=0.5): 213 | pred = get_preds(pred) 214 | norm_score(pred) 215 | facebox_list, event_list, file_list, hard_gt_list, medium_gt_list, easy_gt_list = get_gt_boxes( 216 | gt_path 217 | ) 218 | event_num = len(event_list) 219 | thresh_num = 1000 220 | setting_gts = [easy_gt_list, medium_gt_list, hard_gt_list] 221 | aps = [] 222 | for setting_id in range(3): 223 | # different setting 224 | gt_list = setting_gts[setting_id] 225 | count_face = 0 226 | pr_curve = np.zeros((thresh_num, 2)).astype("float") 227 | # [hard, medium, easy] 228 | for i in range(event_num): 229 | event_name = str(event_list[i][0][0]) 230 | img_list = file_list[i][0] 231 | pred_list = pred[event_name] 232 | sub_gt_list = gt_list[i][0] 233 | # img_pr_info_list = np.zeros((len(img_list), thresh_num, 2)) 234 | gt_bbx_list = facebox_list[i][0] 235 | 236 | for j in range(len(img_list)): 237 | pred_info = pred_list[str(img_list[j][0][0])] 238 | 239 | gt_boxes = gt_bbx_list[j][0].astype("float") 240 | keep_index = sub_gt_list[j][0] 241 | count_face += len(keep_index) 242 | 243 | if len(gt_boxes) == 0 or len(pred_info) == 0: 244 | continue 245 | ignore = np.zeros(gt_boxes.shape[0]) 246 | if len(keep_index) != 0: 247 | ignore[keep_index - 1] = 1 248 | pred_recall, proposal_list = image_eval(pred_info, gt_boxes, ignore, iou_thresh) 249 | 250 | _img_pr_info = img_pr_info(thresh_num, pred_info, 
proposal_list, pred_recall) 251 | 252 | pr_curve += _img_pr_info 253 | pr_curve = dataset_pr_info(thresh_num, pr_curve, count_face) 254 | 255 | propose = pr_curve[:, 0] 256 | recall = pr_curve[:, 1] 257 | 258 | ap = voc_ap(recall, propose) 259 | aps.append(ap) 260 | 261 | logger = logging.getLogger(__name__) 262 | logger.info("Easy Val AP: {}".format(aps[0])) 263 | logger.info("Medium Val AP: {}".format(aps[1])) 264 | logger.info("Hard Val AP: {}".format(aps[2])) 265 | return aps 266 | 267 | 268 | if __name__ == "__main__": 269 | 270 | parser = argparse.ArgumentParser() 271 | parser.add_argument("-p", "--pred", default="./tools/widerface_evaluate/widerface_txt/") 272 | parser.add_argument("-g", "--gt", default="./tools/widerface_evaluate/ground_truth/") 273 | 274 | args = parser.parse_args() 275 | evaluation(args.pred, args.gt) 276 | -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lbin/Retinaface_Detectron2/579e500b35efac6afc389dfc9bbea0b129e91ba6/mmdet/layers/DCNv2/__init__.py -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/dcn_v2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import absolute_import, division, print_function 3 | 4 | import math 5 | 6 | import torch 7 | from torch import nn 8 | from torch.autograd import Function 9 | from torch.autograd.function import once_differentiable 10 | from torch.nn.modules.utils import _pair 11 | 12 | import _ext as _backend 13 | 14 | 15 | class _DCNv2(Function): 16 | @staticmethod 17 | def forward( 18 | ctx, input, offset, mask, weight, bias, stride, padding, dilation, deformable_groups 19 | ): 20 | ctx.stride = _pair(stride) 21 | ctx.padding = _pair(padding) 22 | ctx.dilation = _pair(dilation) 23 | ctx.kernel_size = _pair(weight.shape[2:4]) 24 | ctx.deformable_groups = deformable_groups 25 | output = _backend.dcn_v2_forward( 26 | input, 27 | weight, 28 | bias, 29 | offset, 30 | mask, 31 | ctx.kernel_size[0], 32 | ctx.kernel_size[1], 33 | ctx.stride[0], 34 | ctx.stride[1], 35 | ctx.padding[0], 36 | ctx.padding[1], 37 | ctx.dilation[0], 38 | ctx.dilation[1], 39 | ctx.deformable_groups, 40 | ) 41 | ctx.save_for_backward(input, offset, mask, weight, bias) 42 | return output 43 | 44 | @staticmethod 45 | @once_differentiable 46 | def backward(ctx, grad_output): 47 | input, offset, mask, weight, bias = ctx.saved_tensors 48 | grad_input, grad_offset, grad_mask, grad_weight, grad_bias = _backend.dcn_v2_backward( 49 | input, 50 | weight, 51 | bias, 52 | offset, 53 | mask, 54 | grad_output, 55 | ctx.kernel_size[0], 56 | ctx.kernel_size[1], 57 | ctx.stride[0], 58 | ctx.stride[1], 59 | ctx.padding[0], 60 | ctx.padding[1], 61 | ctx.dilation[0], 62 | ctx.dilation[1], 63 | ctx.deformable_groups, 64 | ) 65 | 66 | return grad_input, grad_offset, grad_mask, grad_weight, grad_bias, None, None, None, None 67 | 68 | 69 | dcn_v2_conv = _DCNv2.apply 70 | 71 | 72 | class DCNv2(nn.Module): 73 | def __init__( 74 | self, 75 | in_channels, 76 | out_channels, 77 | kernel_size, 78 | stride, 79 | padding, 80 | dilation=1, 81 | deformable_groups=1, 82 | ): 83 | super(DCNv2, self).__init__() 84 | self.in_channels = in_channels 85 | self.out_channels = out_channels 86 | self.kernel_size = _pair(kernel_size) 87 | self.stride = _pair(stride) 88 | self.padding = 
_pair(padding) 89 | self.dilation = _pair(dilation) 90 | self.deformable_groups = deformable_groups 91 | 92 | self.weight = nn.Parameter(torch.Tensor(out_channels, in_channels, *self.kernel_size)) 93 | self.bias = nn.Parameter(torch.Tensor(out_channels)) 94 | self.reset_parameters() 95 | 96 | def reset_parameters(self): 97 | n = self.in_channels 98 | for k in self.kernel_size: 99 | n *= k 100 | stdv = 1.0 / math.sqrt(n) 101 | self.weight.data.uniform_(-stdv, stdv) 102 | self.bias.data.zero_() 103 | 104 | def forward(self, input, offset, mask): 105 | assert ( 106 | 2 * self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] 107 | == offset.shape[1] 108 | ) 109 | assert self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == mask.shape[1] 110 | return dcn_v2_conv( 111 | input, 112 | offset, 113 | mask, 114 | self.weight, 115 | self.bias, 116 | self.stride, 117 | self.padding, 118 | self.dilation, 119 | self.deformable_groups, 120 | ) 121 | 122 | 123 | class DCN(DCNv2): 124 | def __init__( 125 | self, 126 | in_channels, 127 | out_channels, 128 | kernel_size, 129 | stride, 130 | padding, 131 | dilation=1, 132 | deformable_groups=1, 133 | ): 134 | super(DCN, self).__init__( 135 | in_channels, out_channels, kernel_size, stride, padding, dilation, deformable_groups 136 | ) 137 | 138 | channels_ = self.deformable_groups * 3 * self.kernel_size[0] * self.kernel_size[1] 139 | self.conv_offset_mask = nn.Conv2d( 140 | self.in_channels, 141 | channels_, 142 | kernel_size=self.kernel_size, 143 | stride=self.stride, 144 | padding=self.padding, 145 | bias=True, 146 | ) 147 | self.init_offset() 148 | 149 | def init_offset(self): 150 | self.conv_offset_mask.weight.data.zero_() 151 | self.conv_offset_mask.bias.data.zero_() 152 | 153 | def forward(self, input): 154 | out = self.conv_offset_mask(input) 155 | o1, o2, mask = torch.chunk(out, 3, dim=1) 156 | offset = torch.cat((o1, o2), dim=1) 157 | mask = torch.sigmoid(mask) 158 | return dcn_v2_conv( 159 | input, 160 | offset, 161 | mask, 162 | self.weight, 163 | self.bias, 164 | self.stride, 165 | self.padding, 166 | self.dilation, 167 | self.deformable_groups, 168 | ) 169 | 170 | 171 | class _DCNv2Pooling(Function): 172 | @staticmethod 173 | def forward( 174 | ctx, 175 | input, 176 | rois, 177 | offset, 178 | spatial_scale, 179 | pooled_size, 180 | output_dim, 181 | no_trans, 182 | group_size=1, 183 | part_size=None, 184 | sample_per_part=4, 185 | trans_std=0.0, 186 | ): 187 | ctx.spatial_scale = spatial_scale 188 | ctx.no_trans = int(no_trans) 189 | ctx.output_dim = output_dim 190 | ctx.group_size = group_size 191 | ctx.pooled_size = pooled_size 192 | ctx.part_size = pooled_size if part_size is None else part_size 193 | ctx.sample_per_part = sample_per_part 194 | ctx.trans_std = trans_std 195 | 196 | output, output_count = _backend.dcn_v2_psroi_pooling_forward( 197 | input, 198 | rois, 199 | offset, 200 | ctx.no_trans, 201 | ctx.spatial_scale, 202 | ctx.output_dim, 203 | ctx.group_size, 204 | ctx.pooled_size, 205 | ctx.part_size, 206 | ctx.sample_per_part, 207 | ctx.trans_std, 208 | ) 209 | ctx.save_for_backward(input, rois, offset, output_count) 210 | return output 211 | 212 | @staticmethod 213 | @once_differentiable 214 | def backward(ctx, grad_output): 215 | input, rois, offset, output_count = ctx.saved_tensors 216 | grad_input, grad_offset = _backend.dcn_v2_psroi_pooling_backward( 217 | grad_output, 218 | input, 219 | rois, 220 | offset, 221 | output_count, 222 | ctx.no_trans, 223 | ctx.spatial_scale, 224 | ctx.output_dim, 225 | 
ctx.group_size, 226 | ctx.pooled_size, 227 | ctx.part_size, 228 | ctx.sample_per_part, 229 | ctx.trans_std, 230 | ) 231 | 232 | return grad_input, None, grad_offset, None, None, None, None, None, None, None, None 233 | 234 | 235 | dcn_v2_pooling = _DCNv2Pooling.apply 236 | 237 | 238 | class DCNv2Pooling(nn.Module): 239 | def __init__( 240 | self, 241 | spatial_scale, 242 | pooled_size, 243 | output_dim, 244 | no_trans, 245 | group_size=1, 246 | part_size=None, 247 | sample_per_part=4, 248 | trans_std=0.0, 249 | ): 250 | super(DCNv2Pooling, self).__init__() 251 | self.spatial_scale = spatial_scale 252 | self.pooled_size = pooled_size 253 | self.output_dim = output_dim 254 | self.no_trans = no_trans 255 | self.group_size = group_size 256 | self.part_size = pooled_size if part_size is None else part_size 257 | self.sample_per_part = sample_per_part 258 | self.trans_std = trans_std 259 | 260 | def forward(self, input, rois, offset): 261 | assert input.shape[1] == self.output_dim 262 | if self.no_trans: 263 | offset = input.new() 264 | return dcn_v2_pooling( 265 | input, 266 | rois, 267 | offset, 268 | self.spatial_scale, 269 | self.pooled_size, 270 | self.output_dim, 271 | self.no_trans, 272 | self.group_size, 273 | self.part_size, 274 | self.sample_per_part, 275 | self.trans_std, 276 | ) 277 | 278 | 279 | class DCNPooling(DCNv2Pooling): 280 | def __init__( 281 | self, 282 | spatial_scale, 283 | pooled_size, 284 | output_dim, 285 | no_trans, 286 | group_size=1, 287 | part_size=None, 288 | sample_per_part=4, 289 | trans_std=0.0, 290 | deform_fc_dim=1024, 291 | ): 292 | super(DCNPooling, self).__init__( 293 | spatial_scale, 294 | pooled_size, 295 | output_dim, 296 | no_trans, 297 | group_size, 298 | part_size, 299 | sample_per_part, 300 | trans_std, 301 | ) 302 | 303 | self.deform_fc_dim = deform_fc_dim 304 | 305 | if not no_trans: 306 | self.offset_mask_fc = nn.Sequential( 307 | nn.Linear( 308 | self.pooled_size * self.pooled_size * self.output_dim, self.deform_fc_dim 309 | ), 310 | nn.ReLU(inplace=True), 311 | nn.Linear(self.deform_fc_dim, self.deform_fc_dim), 312 | nn.ReLU(inplace=True), 313 | nn.Linear(self.deform_fc_dim, self.pooled_size * self.pooled_size * 3), 314 | ) 315 | self.offset_mask_fc[4].weight.data.zero_() 316 | self.offset_mask_fc[4].bias.data.zero_() 317 | 318 | def forward(self, input, rois): 319 | offset = input.new() 320 | 321 | if not self.no_trans: 322 | 323 | # do roi_align first 324 | n = rois.shape[0] 325 | roi = dcn_v2_pooling( 326 | input, 327 | rois, 328 | offset, 329 | self.spatial_scale, 330 | self.pooled_size, 331 | self.output_dim, 332 | True, # no trans 333 | self.group_size, 334 | self.part_size, 335 | self.sample_per_part, 336 | self.trans_std, 337 | ) 338 | 339 | # build mask and offset 340 | offset_mask = self.offset_mask_fc(roi.view(n, -1)) 341 | offset_mask = offset_mask.view(n, 3, self.pooled_size, self.pooled_size) 342 | o1, o2, mask = torch.chunk(offset_mask, 3, dim=1) 343 | offset = torch.cat((o1, o2), dim=1) 344 | mask = torch.sigmoid(mask) 345 | 346 | # do pooling with offset and mask 347 | return ( 348 | dcn_v2_pooling( 349 | input, 350 | rois, 351 | offset, 352 | self.spatial_scale, 353 | self.pooled_size, 354 | self.output_dim, 355 | self.no_trans, 356 | self.group_size, 357 | self.part_size, 358 | self.sample_per_part, 359 | self.trans_std, 360 | ) 361 | * mask 362 | ) 363 | # only roi_align 364 | return dcn_v2_pooling( 365 | input, 366 | rois, 367 | offset, 368 | self.spatial_scale, 369 | self.pooled_size, 370 | self.output_dim, 371 | 
self.no_trans, 372 | self.group_size, 373 | self.part_size, 374 | self.sample_per_part, 375 | self.trans_std, 376 | ) 377 | -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/src/cpu/dcn_v2_cpu.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "cpu/dcn_v2_im2col_cpu.h" 3 | 4 | #include 5 | //#include 6 | 7 | #include 8 | //#include 9 | //#include 10 | 11 | //extern THCState *state; 12 | 13 | // author: Charles Shang 14 | // https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu 15 | // modified from the CUDA version for CPU use by Daniel K. Suhendro 16 | 17 | at::Tensor 18 | dcn_v2_cpu_forward(const at::Tensor &input, 19 | const at::Tensor &weight, 20 | const at::Tensor &bias, 21 | const at::Tensor &offset, 22 | const at::Tensor &mask, 23 | const int kernel_h, 24 | const int kernel_w, 25 | const int stride_h, 26 | const int stride_w, 27 | const int pad_h, 28 | const int pad_w, 29 | const int dilation_h, 30 | const int dilation_w, 31 | const int deformable_group) 32 | { 33 | // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask)); 34 | /*AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 35 | AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); 36 | AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); 37 | AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); 38 | AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");*/ 39 | 40 | const int batch = input.size(0); 41 | const int channels = input.size(1); 42 | const int height = input.size(2); 43 | const int width = input.size(3); 44 | 45 | const int channels_out = weight.size(0); 46 | const int channels_kernel = weight.size(1); 47 | const int kernel_h_ = weight.size(2); 48 | const int kernel_w_ = weight.size(3); 49 | 50 | // printf("Kernels: %d %d %d %d\n", kernel_h_, kernel_w_, kernel_w, kernel_h); 51 | // printf("Channels: %d %d\n", channels, channels_kernel); 52 | // printf("Channels: %d %d\n", channels_out, channels_kernel); 53 | 54 | AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, 55 | "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); 56 | 57 | AT_ASSERTM(channels == channels_kernel, 58 | "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); 59 | 60 | const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; 61 | const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; 62 | 63 | auto ones = at::ones({height_out, width_out}, input.options()); 64 | auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); 65 | auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); 66 | 67 | using scalar_t = float; 68 | for (int b = 0; b < batch; b++) 69 | { 70 | auto input_n = input.select(0, b); 71 | auto offset_n = offset.select(0, b); 72 | auto mask_n = mask.select(0, b); 73 | auto output_n = output.select(0, b); 74 | 75 | // Do Bias first: 76 | // M,N,K are dims of matrix A and B 77 | // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) 78 | // (N x 1) (1 x M) 79 | long m_ = channels_out; 80 | long n_ = height_out * width_out; 81 | long k_ = 1; 82 | THFloatBlas_gemm('t', 'n', n_, m_, k_, 1.0f, 83 | ones.contiguous().data(), k_, 84 | 
bias.contiguous().data(), k_, 0.0f, 85 | output_n.data(), n_); 86 | 87 | modulated_deformable_im2col_cpu(input_n.data(), 88 | offset_n.data(), 89 | mask_n.data(), 90 | 1, channels, height, width, 91 | height_out, width_out, kernel_h, kernel_w, 92 | pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, 93 | deformable_group, 94 | columns.data()); 95 | 96 | //(k * m) x (m * n) 97 | // Y = WC 98 | long m = channels_out; 99 | long n = height_out * width_out; 100 | long k = channels * kernel_h * kernel_w; 101 | THFloatBlas_gemm('n', 'n', n, m, k, 1.0f, 102 | columns.data(), n, 103 | weight.data(), k, 1.0f, 104 | output_n.data(), n); 105 | } 106 | return output; 107 | } 108 | 109 | std::vector dcn_v2_cpu_backward(const at::Tensor &input, 110 | const at::Tensor &weight, 111 | const at::Tensor &bias, 112 | const at::Tensor &offset, 113 | const at::Tensor &mask, 114 | const at::Tensor &grad_output, 115 | int kernel_h, int kernel_w, 116 | int stride_h, int stride_w, 117 | int pad_h, int pad_w, 118 | int dilation_h, int dilation_w, 119 | int deformable_group) 120 | { 121 | 122 | THArgCheck(input.is_contiguous(), 1, "input tensor has to be contiguous"); 123 | THArgCheck(weight.is_contiguous(), 2, "weight tensor has to be contiguous"); 124 | 125 | /*AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 126 | AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); 127 | AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); 128 | AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); 129 | AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");*/ 130 | 131 | const int batch = input.size(0); 132 | const int channels = input.size(1); 133 | const int height = input.size(2); 134 | const int width = input.size(3); 135 | 136 | const int channels_out = weight.size(0); 137 | const int channels_kernel = weight.size(1); 138 | const int kernel_h_ = weight.size(2); 139 | const int kernel_w_ = weight.size(3); 140 | 141 | AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, 142 | "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); 143 | 144 | AT_ASSERTM(channels == channels_kernel, 145 | "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); 146 | 147 | const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; 148 | const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; 149 | 150 | auto ones = at::ones({height_out, width_out}, input.options()); 151 | auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); 152 | auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); 153 | 154 | auto grad_input = at::zeros_like(input); 155 | auto grad_weight = at::zeros_like(weight); 156 | auto grad_bias = at::zeros_like(bias); 157 | auto grad_offset = at::zeros_like(offset); 158 | auto grad_mask = at::zeros_like(mask); 159 | 160 | using scalar_t = float; 161 | 162 | for (int b = 0; b < batch; b++) 163 | { 164 | auto input_n = input.select(0, b); 165 | auto offset_n = offset.select(0, b); 166 | auto mask_n = mask.select(0, b); 167 | auto grad_output_n = grad_output.select(0, b); 168 | auto grad_input_n = grad_input.select(0, b); 169 | auto grad_offset_n = grad_offset.select(0, b); 170 | auto grad_mask_n = grad_mask.select(0, b); 171 | 172 | long m = channels * kernel_h * kernel_w; 173 | long n = height_out * width_out; 174 
| long k = channels_out; 175 | 176 | THFloatBlas_gemm('n', 't', n, m, k, 1.0f, 177 | grad_output_n.data(), n, 178 | weight.data(), m, 0.0f, 179 | columns.data(), n); 180 | 181 | // gradient w.r.t. input coordinate data 182 | modulated_deformable_col2im_coord_cpu(columns.data(), 183 | input_n.data(), 184 | offset_n.data(), 185 | mask_n.data(), 186 | 1, channels, height, width, 187 | height_out, width_out, kernel_h, kernel_w, 188 | pad_h, pad_w, stride_h, stride_w, 189 | dilation_h, dilation_w, deformable_group, 190 | grad_offset_n.data(), 191 | grad_mask_n.data()); 192 | // gradient w.r.t. input data 193 | modulated_deformable_col2im_cpu(columns.data(), 194 | offset_n.data(), 195 | mask_n.data(), 196 | 1, channels, height, width, 197 | height_out, width_out, kernel_h, kernel_w, 198 | pad_h, pad_w, stride_h, stride_w, 199 | dilation_h, dilation_w, deformable_group, 200 | grad_input_n.data()); 201 | 202 | // gradient w.r.t. weight, dWeight should accumulate across the batch and group 203 | modulated_deformable_im2col_cpu(input_n.data(), 204 | offset_n.data(), 205 | mask_n.data(), 206 | 1, channels, height, width, 207 | height_out, width_out, kernel_h, kernel_w, 208 | pad_h, pad_w, stride_h, stride_w, 209 | dilation_h, dilation_w, deformable_group, 210 | columns.data()); 211 | 212 | long m_ = channels_out; 213 | long n_ = channels * kernel_h * kernel_w; 214 | long k_ = height_out * width_out; 215 | 216 | THFloatBlas_gemm('t', 'n', n_, m_, k_, 1.0f, 217 | columns.data(), k_, 218 | grad_output_n.data(), k_, 1.0f, 219 | grad_weight.data(), n_); 220 | 221 | // gradient w.r.t. bias 222 | // long m_ = channels_out; 223 | // long k__ = height_out * width_out; 224 | THFloatBlas_gemv('t', k_, m_, 1.0f, 225 | grad_output_n.data(), k_, 226 | ones.data(), 1, 1.0f, 227 | grad_bias.data(), 1); 228 | } 229 | 230 | return { 231 | grad_input, grad_offset, grad_mask, grad_weight, grad_bias 232 | }; 233 | } -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/src/cpu/dcn_v2_im2col_cpu.cpp: -------------------------------------------------------------------------------- 1 | #include "dcn_v2_im2col_cpu.h" 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | //#include 8 | 9 | #include 10 | //#include 11 | //#include 12 | 13 | // modified from the CUDA version for CPU use by Daniel K. 
Suhendro 14 | 15 | /*#define CUDA_KERNEL_LOOP(i, n) \ 16 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ 17 | i < (n); \ 18 | i += blockDim.x * gridDim.x) 19 | 20 | const int CUDA_NUM_THREADS = 1024; 21 | inline int GET_BLOCKS(const int N) 22 | { 23 | return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; 24 | }*/ 25 | 26 | 27 | float dmcn_im2col_bilinear_cpu(const float *bottom_data, const int data_width, 28 | const int height, const int width, float h, float w) 29 | { 30 | int h_low = floor(h); 31 | int w_low = floor(w); 32 | int h_high = h_low + 1; 33 | int w_high = w_low + 1; 34 | 35 | float lh = h - h_low; 36 | float lw = w - w_low; 37 | float hh = 1 - lh, hw = 1 - lw; 38 | 39 | float v1 = 0; 40 | if (h_low >= 0 && w_low >= 0) 41 | v1 = bottom_data[h_low * data_width + w_low]; 42 | float v2 = 0; 43 | if (h_low >= 0 && w_high <= width - 1) 44 | v2 = bottom_data[h_low * data_width + w_high]; 45 | float v3 = 0; 46 | if (h_high <= height - 1 && w_low >= 0) 47 | v3 = bottom_data[h_high * data_width + w_low]; 48 | float v4 = 0; 49 | if (h_high <= height - 1 && w_high <= width - 1) 50 | v4 = bottom_data[h_high * data_width + w_high]; 51 | 52 | float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; 53 | 54 | float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); 55 | return val; 56 | } 57 | 58 | float dmcn_get_gradient_weight_cpu(float argmax_h, float argmax_w, 59 | const int h, const int w, const int height, const int width) 60 | { 61 | if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) 62 | { 63 | //empty 64 | return 0; 65 | } 66 | 67 | int argmax_h_low = floor(argmax_h); 68 | int argmax_w_low = floor(argmax_w); 69 | int argmax_h_high = argmax_h_low + 1; 70 | int argmax_w_high = argmax_w_low + 1; 71 | 72 | float weight = 0; 73 | if (h == argmax_h_low && w == argmax_w_low) 74 | weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); 75 | if (h == argmax_h_low && w == argmax_w_high) 76 | weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); 77 | if (h == argmax_h_high && w == argmax_w_low) 78 | weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); 79 | if (h == argmax_h_high && w == argmax_w_high) 80 | weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); 81 | return weight; 82 | } 83 | 84 | float dmcn_get_coordinate_weight_cpu(float argmax_h, float argmax_w, 85 | const int height, const int width, const float *im_data, 86 | const int data_width, const int bp_dir) 87 | { 88 | if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) 89 | { 90 | //empty 91 | return 0; 92 | } 93 | 94 | int argmax_h_low = floor(argmax_h); 95 | int argmax_w_low = floor(argmax_w); 96 | int argmax_h_high = argmax_h_low + 1; 97 | int argmax_w_high = argmax_w_low + 1; 98 | 99 | float weight = 0; 100 | 101 | if (bp_dir == 0) 102 | { 103 | if (argmax_h_low >= 0 && argmax_w_low >= 0) 104 | weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; 105 | if (argmax_h_low >= 0 && argmax_w_high <= width - 1) 106 | weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; 107 | if (argmax_h_high <= height - 1 && argmax_w_low >= 0) 108 | weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; 109 | if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) 110 | weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; 111 | } 112 | else if (bp_dir == 1) 113 | { 114 | if (argmax_h_low >= 0 && argmax_w_low >= 0) 115 | weight += -1 
* (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; 116 | if (argmax_h_low >= 0 && argmax_w_high <= width - 1) 117 | weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; 118 | if (argmax_h_high <= height - 1 && argmax_w_low >= 0) 119 | weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; 120 | if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) 121 | weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; 122 | } 123 | 124 | return weight; 125 | } 126 | 127 | void modulated_deformable_im2col_cpu_kernel(const int n, const float *data_im, const float *data_offset, const float *data_mask, 128 | const int height, const int width, const int kernel_h, const int kernel_w, 129 | const int pad_h, const int pad_w, 130 | const int stride_h, const int stride_w, 131 | const int dilation_h, const int dilation_w, 132 | const int channel_per_deformable_group, 133 | const int batch_size, const int num_channels, const int deformable_group, 134 | const int height_col, const int width_col, 135 | float *data_col) 136 | { 137 | // launch channels * batch_size * height_col * width_col cores 138 | for(int index=0; index(0); 178 | const float h_im = h_in + i * dilation_h + offset_h; 179 | const float w_im = w_in + j * dilation_w + offset_w; 180 | //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { 181 | if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) 182 | { 183 | //const float map_h = i * dilation_h + offset_h; 184 | //const float map_w = j * dilation_w + offset_w; 185 | //const int cur_height = height - h_in; 186 | //const int cur_width = width - w_in; 187 | //val = dmcn_im2col_bilinear_cpu(data_im_ptr, width, cur_height, cur_width, map_h, map_w); 188 | val = dmcn_im2col_bilinear_cpu(data_im_ptr, width, height, width, h_im, w_im); 189 | } 190 | *data_col_ptr = val * mask; 191 | // data_col_ptr += batch_size * height_col * width_col; 192 | data_col_ptr += height_col * width_col; 193 | } 194 | } 195 | } 196 | } 197 | 198 | void modulated_deformable_col2im_cpu_kernel(const int n, const float *data_col, const float *data_offset, const float *data_mask, 199 | const int channels, const int height, const int width, 200 | const int kernel_h, const int kernel_w, 201 | const int pad_h, const int pad_w, 202 | const int stride_h, const int stride_w, 203 | const int dilation_h, const int dilation_w, 204 | const int channel_per_deformable_group, 205 | const int batch_size, const int deformable_group, 206 | const int height_col, const int width_col, 207 | float *grad_im) 208 | { 209 | for(int index = 0; index < n; index++) 210 | { 211 | const int j = (index / width_col / height_col / batch_size) % kernel_w; 212 | const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; 213 | const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; 214 | // compute the start and end of the output 215 | 216 | const int deformable_group_index = c / channel_per_deformable_group; 217 | 218 | int w_out = index % width_col; 219 | int h_out = (index / width_col) % height_col; 220 | int b = (index / width_col / height_col) % batch_size; 221 | int w_in = w_out * stride_w - pad_w; 222 | int h_in = h_out * stride_h - pad_h; 223 | 224 | const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; 225 | const float *data_mask_ptr = data_mask 
+ (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; 226 | const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; 227 | const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; 228 | const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; 229 | const float offset_h = data_offset_ptr[data_offset_h_ptr]; 230 | const float offset_w = data_offset_ptr[data_offset_w_ptr]; 231 | const float mask = data_mask_ptr[data_mask_hw_ptr]; 232 | const float cur_inv_h_data = h_in + i * dilation_h + offset_h; 233 | const float cur_inv_w_data = w_in + j * dilation_w + offset_w; 234 | 235 | const float cur_top_grad = data_col[index] * mask; 236 | const int cur_h = (int)cur_inv_h_data; 237 | const int cur_w = (int)cur_inv_w_data; 238 | 239 | for (int dy = -2; dy <= 2; dy++) 240 | { 241 | for (int dx = -2; dx <= 2; dx++) 242 | { 243 | if (cur_h + dy >= 0 && cur_h + dy < height && 244 | cur_w + dx >= 0 && cur_w + dx < width && 245 | abs(cur_inv_h_data - (cur_h + dy)) < 1 && 246 | abs(cur_inv_w_data - (cur_w + dx)) < 1) 247 | { 248 | int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; 249 | float weight = dmcn_get_gradient_weight_cpu(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); 250 | //atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); 251 | *(grad_im + cur_bottom_grad_pos) += weight * cur_top_grad; 252 | 253 | } 254 | } 255 | } 256 | } 257 | } 258 | 259 | void modulated_deformable_col2im_coord_cpu_kernel(const int n, const float *data_col, const float *data_im, 260 | const float *data_offset, const float *data_mask, 261 | const int channels, const int height, const int width, 262 | const int kernel_h, const int kernel_w, 263 | const int pad_h, const int pad_w, 264 | const int stride_h, const int stride_w, 265 | const int dilation_h, const int dilation_w, 266 | const int channel_per_deformable_group, 267 | const int batch_size, const int offset_channels, const int deformable_group, 268 | const int height_col, const int width_col, 269 | float *grad_offset, float *grad_mask) 270 | { 271 | for(int index = 0; index < n; index++) 272 | { 273 | float val = 0, mval = 0; 274 | int w = index % width_col; 275 | int h = (index / width_col) % height_col; 276 | int c = (index / width_col / height_col) % offset_channels; 277 | int b = (index / width_col / height_col) / offset_channels; 278 | // compute the start and end of the output 279 | 280 | const int deformable_group_index = c / (2 * kernel_h * kernel_w); 281 | const int col_step = kernel_h * kernel_w; 282 | int cnt = 0; 283 | const float *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; 284 | const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; 285 | const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; 286 | const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; 287 | 288 | const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; 289 | 290 | for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) 291 | { 292 | const int col_pos = 
(((col_c * batch_size + b) * height_col) + h) * width_col + w; 293 | const int bp_dir = offset_c % 2; 294 | 295 | int j = (col_pos / width_col / height_col / batch_size) % kernel_w; 296 | int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; 297 | int w_out = col_pos % width_col; 298 | int h_out = (col_pos / width_col) % height_col; 299 | int w_in = w_out * stride_w - pad_w; 300 | int h_in = h_out * stride_h - pad_h; 301 | const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); 302 | const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); 303 | const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); 304 | const float offset_h = data_offset_ptr[data_offset_h_ptr]; 305 | const float offset_w = data_offset_ptr[data_offset_w_ptr]; 306 | const float mask = data_mask_ptr[data_mask_hw_ptr]; 307 | float inv_h = h_in + i * dilation_h + offset_h; 308 | float inv_w = w_in + j * dilation_w + offset_w; 309 | if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) 310 | { 311 | inv_h = inv_w = -2; 312 | } 313 | else 314 | { 315 | mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear_cpu(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); 316 | } 317 | const float weight = dmcn_get_coordinate_weight_cpu( 318 | inv_h, inv_w, 319 | height, width, data_im_ptr + cnt * height * width, width, bp_dir); 320 | val += weight * data_col_ptr[col_pos] * mask; 321 | cnt += 1; 322 | } 323 | // KERNEL_ASSIGN(grad_offset[index], offset_req, val); 324 | grad_offset[index] = val; 325 | if (offset_c % 2 == 0) 326 | // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval); 327 | grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; 328 | } 329 | } 330 | 331 | void modulated_deformable_im2col_cpu(const float* data_im, const float* data_offset, const float* data_mask, 332 | const int batch_size, const int channels, const int height_im, const int width_im, 333 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 334 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 335 | const int dilation_h, const int dilation_w, 336 | const int deformable_group, float* data_col) { 337 | // num_axes should be smaller than block size 338 | const int channel_per_deformable_group = channels / deformable_group; 339 | const int num_kernels = channels * batch_size * height_col * width_col; 340 | modulated_deformable_im2col_cpu_kernel( 341 | num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w, 342 | pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, 343 | batch_size, channels, deformable_group, height_col, width_col, data_col); 344 | 345 | /*cudaError_t err = cudaGetLastError(); 346 | if (err != cudaSuccess) 347 | { 348 | printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); 349 | }*/ 350 | 351 | } 352 | 353 | void modulated_deformable_col2im_cpu(const float* data_col, const float* data_offset, const float* data_mask, 354 | const int batch_size, const int channels, const int height_im, const int width_im, 355 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 356 | const int pad_h, 
const int pad_w, const int stride_h, const int stride_w, 357 | const int dilation_h, const int dilation_w, 358 | const int deformable_group, float* grad_im){ 359 | 360 | const int channel_per_deformable_group = channels / deformable_group; 361 | const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; 362 | modulated_deformable_col2im_cpu_kernel( 363 | num_kernels, data_col, data_offset, data_mask, channels, height_im, width_im, 364 | kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, 365 | dilation_h, dilation_w, channel_per_deformable_group, 366 | batch_size, deformable_group, height_col, width_col, grad_im); 367 | /*cudaError_t err = cudaGetLastError(); 368 | if (err != cudaSuccess) 369 | { 370 | printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); 371 | }*/ 372 | 373 | } 374 | 375 | void modulated_deformable_col2im_coord_cpu(const float* data_col, const float* data_im, const float* data_offset, const float* data_mask, 376 | const int batch_size, const int channels, const int height_im, const int width_im, 377 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 378 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 379 | const int dilation_h, const int dilation_w, 380 | const int deformable_group, 381 | float* grad_offset, float* grad_mask) { 382 | const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; 383 | const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; 384 | modulated_deformable_col2im_coord_cpu_kernel( 385 | num_kernels, data_col, data_im, data_offset, data_mask, channels, height_im, width_im, 386 | kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, 387 | dilation_h, dilation_w, channel_per_deformable_group, 388 | batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, 389 | grad_offset, grad_mask); 390 | /*cudaError_t err = cudaGetLastError(); 391 | if (err != cudaSuccess) 392 | { 393 | printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); 394 | }*/ 395 | } -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/src/cpu/dcn_v2_im2col_cpu.h: -------------------------------------------------------------------------------- 1 | 2 | /*! 3 | ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** 4 | * 5 | * COPYRIGHT 6 | * 7 | * All contributions by the University of California: 8 | * Copyright (c) 2014-2017 The Regents of the University of California (Regents) 9 | * All rights reserved. 10 | * 11 | * All other contributions: 12 | * Copyright (c) 2014-2017, the respective contributors 13 | * All rights reserved. 14 | * 15 | * Caffe uses a shared copyright model: each contributor holds copyright over 16 | * their contributions to Caffe. The project versioning records all such 17 | * contribution and copyright details. If a contributor wants to further mark 18 | * their specific copyright on a particular contribution, they should indicate 19 | * their copyright solely in the commit message of the change when it is 20 | * committed. 21 | * 22 | * LICENSE 23 | * 24 | * Redistribution and use in source and binary forms, with or without 25 | * modification, are permitted provided that the following conditions are met: 26 | * 27 | * 1.
Redistributions of source code must retain the above copyright notice, this 28 | * list of conditions and the following disclaimer. 29 | * 2. Redistributions in binary form must reproduce the above copyright notice, 30 | * this list of conditions and the following disclaimer in the documentation 31 | * and/or other materials provided with the distribution. 32 | * 33 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 34 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 35 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 36 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 37 | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 38 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 39 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 40 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 41 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 42 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 43 | * 44 | * CONTRIBUTION AGREEMENT 45 | * 46 | * By contributing to the BVLC/caffe repository through pull-request, comment, 47 | * or otherwise, the contributor releases their content to the 48 | * license and copyright terms herein. 49 | * 50 | ***************** END Caffe Copyright Notice and Disclaimer ******************** 51 | * 52 | * Copyright (c) 2018 Microsoft 53 | * Licensed under The MIT License [see LICENSE for details] 54 | * \file modulated_deformable_im2col.h 55 | * \brief Function definitions of converting an image to 56 | * column matrix based on kernel, padding, dilation, and offset. 57 | * These functions are mainly used in deformable convolution operators. 58 | * \ref: https://arxiv.org/abs/1811.11168 59 | * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu 60 | */ 61 | 62 | /***************** Adapted by Charles Shang *********************/ 63 | // modified from the CUDA version for CPU use by Daniel K. 
Suhendro 64 | 65 | #ifndef DCN_V2_IM2COL_CPU 66 | #define DCN_V2_IM2COL_CPU 67 | 68 | #ifdef __cplusplus 69 | extern "C" 70 | { 71 | #endif 72 | 73 | void modulated_deformable_im2col_cpu(const float *data_im, const float *data_offset, const float *data_mask, 74 | const int batch_size, const int channels, const int height_im, const int width_im, 75 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 76 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 77 | const int dilation_h, const int dilation_w, 78 | const int deformable_group, float *data_col); 79 | 80 | void modulated_deformable_col2im_cpu(const float *data_col, const float *data_offset, const float *data_mask, 81 | const int batch_size, const int channels, const int height_im, const int width_im, 82 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 83 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 84 | const int dilation_h, const int dilation_w, 85 | const int deformable_group, float *grad_im); 86 | 87 | void modulated_deformable_col2im_coord_cpu(const float *data_col, const float *data_im, const float *data_offset, const float *data_mask, 88 | const int batch_size, const int channels, const int height_im, const int width_im, 89 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 90 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 91 | const int dilation_h, const int dilation_w, 92 | const int deformable_group, 93 | float *grad_offset, float *grad_mask); 94 | 95 | #ifdef __cplusplus 96 | } 97 | #endif 98 | 99 | #endif -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * Copyright (c) 2017 Microsoft 3 | * Licensed under The MIT License [see LICENSE for details] 4 | * \file deformable_psroi_pooling.cu 5 | * \brief 6 | * \author Yi Li, Guodong Zhang, Jifeng Dai 7 | */ 8 | /***************** Adapted by Charles Shang *********************/ 9 | // modified from the CUDA version for CPU use by Daniel K.
Suhendro 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | #include 16 | //#include 17 | 18 | #include 19 | //#include 20 | //#include 21 | 22 | /*#define CUDA_KERNEL_LOOP(i, n) \ 23 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ 24 | i < (n); \ 25 | i += blockDim.x * gridDim.x) 26 | 27 | const int CUDA_NUM_THREADS = 1024; 28 | inline int GET_BLOCKS(const int N) 29 | { 30 | return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; 31 | }*/ 32 | 33 | template 34 | T bilinear_interp_cpu( 35 | const T *data, 36 | const T x, 37 | const T y, 38 | const int width, 39 | const int height) 40 | { 41 | int x1 = floor(x); 42 | int x2 = ceil(x); 43 | int y1 = floor(y); 44 | int y2 = ceil(y); 45 | T dist_x = static_cast(x - x1); 46 | T dist_y = static_cast(y - y1); 47 | T value11 = data[y1 * width + x1]; 48 | T value12 = data[y2 * width + x1]; 49 | T value21 = data[y1 * width + x2]; 50 | T value22 = data[y2 * width + x2]; 51 | T value = (1 - dist_x) * (1 - dist_y) * value11 + 52 | (1 - dist_x) * dist_y * value12 + 53 | dist_x * (1 - dist_y) * value21 + 54 | dist_x * dist_y * value22; 55 | return value; 56 | } 57 | 58 | template 59 | void DeformablePSROIPoolForwardKernelCpu( 60 | const int count, 61 | const T *bottom_data, 62 | const T spatial_scale, 63 | const int channels, 64 | const int height, const int width, 65 | const int pooled_height, const int pooled_width, 66 | const T *bottom_rois, const T *bottom_trans, 67 | const int no_trans, 68 | const T trans_std, 69 | const int sample_per_part, 70 | const int output_dim, 71 | const int group_size, 72 | const int part_size, 73 | const int num_classes, 74 | const int channels_each_class, 75 | T *top_data, 76 | T *top_count) 77 | { 78 | for(int index = 0; index < count; index++) 79 | { 80 | // The output is in order (n, ctop, ph, pw) 81 | int pw = index % pooled_width; 82 | int ph = (index / pooled_width) % pooled_height; 83 | int ctop = (index / pooled_width / pooled_height) % output_dim; 84 | int n = index / pooled_width / pooled_height / output_dim; 85 | 86 | // [start, end) interval for spatial sampling 87 | const T *offset_bottom_rois = bottom_rois + n * 5; 88 | int roi_batch_ind = offset_bottom_rois[0]; 89 | T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; 90 | T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; 91 | T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; 92 | T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; 93 | 94 | // Force too small ROIs to be 1x1 95 | T roi_width = std::max(roi_end_w - roi_start_w, T(0.1)); //avoid 0 96 | T roi_height = std::max(roi_end_h - roi_start_h, T(0.1)); 97 | 98 | // Compute w and h at bottom 99 | T bin_size_h = roi_height / static_cast(pooled_height); 100 | T bin_size_w = roi_width / static_cast(pooled_width); 101 | 102 | T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); 103 | T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); 104 | 105 | int part_h = floor(static_cast(ph) / pooled_height * part_size); 106 | int part_w = floor(static_cast(pw) / pooled_width * part_size); 107 | int class_id = ctop / channels_each_class; 108 | T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; 109 | T trans_y = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; 110 | 111 | T wstart = static_cast(pw) * bin_size_w + roi_start_w; 112 | wstart += trans_x * roi_width; 113 | T hstart = static_cast(ph) * bin_size_h + roi_start_h; 114 | hstart += trans_y * roi_height; 115 | 116 | T sum = 0; 117 | int count = 0; 118 | int gw = floor(static_cast(pw) * group_size / pooled_width); 119 | int gh = floor(static_cast(ph) * group_size / pooled_height); 120 | gw = std::min(std::max(gw, 0), group_size - 1); 121 | gh = std::min(std::max(gh, 0), group_size - 1); 122 | 123 | const T *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; 124 | for (int ih = 0; ih < sample_per_part; ih++) 125 | { 126 | for (int iw = 0; iw < sample_per_part; iw++) 127 | { 128 | T w = wstart + iw * sub_bin_size_w; 129 | T h = hstart + ih * sub_bin_size_h; 130 | // bilinear interpolation 131 | if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) 132 | { 133 | continue; 134 | } 135 | w = std::min(std::max(w, T(0.)), width - T(1.)); 136 | h = std::min(std::max(h, T(0.)), height - T(1.)); 137 | int c = (ctop * group_size + gh) * group_size + gw; 138 | T val = bilinear_interp_cpu(offset_bottom_data + c * height * width, w, h, width, height); 139 | sum += val; 140 | count++; 141 | } 142 | } 143 | top_data[index] = count == 0 ? static_cast(0) : sum / count; 144 | top_count[index] = count; 145 | } 146 | } 147 | 148 | template 149 | void DeformablePSROIPoolBackwardAccKernelCpu( 150 | const int count, 151 | const T *top_diff, 152 | const T *top_count, 153 | const int num_rois, 154 | const T spatial_scale, 155 | const int channels, 156 | const int height, const int width, 157 | const int pooled_height, const int pooled_width, 158 | const int output_dim, 159 | T *bottom_data_diff, T *bottom_trans_diff, 160 | const T *bottom_data, 161 | const T *bottom_rois, 162 | const T *bottom_trans, 163 | const int no_trans, 164 | const T trans_std, 165 | const int sample_per_part, 166 | const int group_size, 167 | const int part_size, 168 | const int num_classes, 169 | const int channels_each_class) 170 | { 171 | for(int index = 0; index < count; index++) 172 | { 173 | // The output is in order (n, ctop, ph, pw) 174 | int pw = index % pooled_width; 175 | int ph = (index / pooled_width) % pooled_height; 176 | int ctop = (index / pooled_width / pooled_height) % output_dim; 177 | int n = index / pooled_width / pooled_height / output_dim; 178 | 179 | // [start, end) interval for spatial sampling 180 | const T *offset_bottom_rois = bottom_rois + n * 5; 181 | int roi_batch_ind = offset_bottom_rois[0]; 182 | T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; 183 | T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; 184 | T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; 185 | T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) 
* spatial_scale - 0.5; 186 | 187 | // Force too small ROIs to be 1x1 188 | T roi_width = std::max(roi_end_w - roi_start_w, T(0.1)); //avoid 0 189 | T roi_height = std::max(roi_end_h - roi_start_h, T(0.1)); 190 | 191 | // Compute w and h at bottom 192 | T bin_size_h = roi_height / static_cast(pooled_height); 193 | T bin_size_w = roi_width / static_cast(pooled_width); 194 | 195 | T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); 196 | T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); 197 | 198 | int part_h = floor(static_cast(ph) / pooled_height * part_size); 199 | int part_w = floor(static_cast(pw) / pooled_width * part_size); 200 | int class_id = ctop / channels_each_class; 201 | T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; 202 | T trans_y = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; 203 | 204 | T wstart = static_cast(pw) * bin_size_w + roi_start_w; 205 | wstart += trans_x * roi_width; 206 | T hstart = static_cast(ph) * bin_size_h + roi_start_h; 207 | hstart += trans_y * roi_height; 208 | 209 | if (top_count[index] <= 0) 210 | { 211 | continue; 212 | } 213 | T diff_val = top_diff[index] / top_count[index]; 214 | const T *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; 215 | T *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; 216 | int gw = floor(static_cast(pw) * group_size / pooled_width); 217 | int gh = floor(static_cast(ph) * group_size / pooled_height); 218 | gw = std::min(std::max(gw, 0), group_size - 1); 219 | gh = std::min(std::max(gh, 0), group_size - 1); 220 | 221 | for (int ih = 0; ih < sample_per_part; ih++) 222 | { 223 | for (int iw = 0; iw < sample_per_part; iw++) 224 | { 225 | T w = wstart + iw * sub_bin_size_w; 226 | T h = hstart + ih * sub_bin_size_h; 227 | // bilinear interpolation 228 | if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) 229 | { 230 | continue; 231 | } 232 | w = std::min(std::max(w, T(0.)), width - T(1.)); 233 | h = std::min(std::max(h, T(0.)), height - T(1.)); 234 | int c = (ctop * group_size + gh) * group_size + gw; 235 | // backward on feature 236 | int x0 = floor(w); 237 | int x1 = ceil(w); 238 | int y0 = floor(h); 239 | int y1 = ceil(h); 240 | T dist_x = w - x0, dist_y = h - y0; 241 | T q00 = (1 - dist_x) * (1 - dist_y); 242 | T q01 = (1 - dist_x) * dist_y; 243 | T q10 = dist_x * (1 - dist_y); 244 | T q11 = dist_x * dist_y; 245 | int bottom_index_base = c * height * width; 246 | /*atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); 247 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); 248 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); 249 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val);*/ 250 | *(offset_bottom_data_diff + bottom_index_base + y0 * width + x0) += q00 * diff_val; 251 | *(offset_bottom_data_diff + bottom_index_base + y1 * width + x0) += q01 * diff_val; 252 | *(offset_bottom_data_diff + bottom_index_base + y0 * width + x1) += q10 * diff_val; 253 | *(offset_bottom_data_diff + bottom_index_base + y1 * width + x1) += q11 * diff_val; 254 | 255 | 256 | if (no_trans) 257 | { 258 | continue; 259 | } 260 | T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; 261 | T U01 = 
offset_bottom_data[bottom_index_base + y1 * width + x0]; 262 | T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; 263 | T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; 264 | T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; 265 | diff_x *= roi_width; 266 | T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val; 267 | diff_y *= roi_height; 268 | 269 | /*atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x); 270 | atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y);*/ 271 | *(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w) += diff_x; 272 | *(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w) += diff_y; 273 | } 274 | } 275 | } 276 | } 277 | 278 | std::tuple 279 | dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input, 280 | const at::Tensor &bbox, 281 | const at::Tensor &trans, 282 | const int no_trans, 283 | const float spatial_scale, 284 | const int output_dim, 285 | const int group_size, 286 | const int pooled_size, 287 | const int part_size, 288 | const int sample_per_part, 289 | const float trans_std) 290 | { 291 | /*AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 292 | AT_ASSERTM(bbox.type().is_cuda(), "rois must be a CUDA tensor"); 293 | AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor");*/ 294 | 295 | const int batch = input.size(0); 296 | const int channels = input.size(1); 297 | const int height = input.size(2); 298 | const int width = input.size(3); 299 | const int channels_trans = no_trans ? 2 : trans.size(1); 300 | const int num_bbox = bbox.size(0); 301 | 302 | AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); 303 | auto pooled_height = pooled_size; 304 | auto pooled_width = pooled_size; 305 | 306 | auto out = at::empty({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); 307 | long out_size = num_bbox * output_dim * pooled_height * pooled_width; 308 | auto top_count = at::zeros({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); 309 | 310 | const int num_classes = no_trans ? 1 : channels_trans / 2; 311 | const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; 312 | 313 | //cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 314 | 315 | if (out.numel() == 0) 316 | { 317 | //THCudaCheck(cudaGetLastError()); 318 | return std::make_tuple(out, top_count); 319 | } 320 | 321 | /*dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); 322 | dim3 block(512);*/ 323 | 324 | AT_DISPATCH_FLOATING_TYPES(input.type(), "dcn_v2_psroi_pooling_cpu_forward", [&] { 325 | DeformablePSROIPoolForwardKernelCpu( 326 | out_size, 327 | input.contiguous().data(), 328 | spatial_scale, 329 | channels, 330 | height, width, 331 | pooled_height, 332 | pooled_width, 333 | bbox.contiguous().data(), 334 | trans.contiguous().data(), 335 | no_trans, 336 | trans_std, 337 | sample_per_part, 338 | output_dim, 339 | group_size, 340 | part_size, 341 | num_classes, 342 | channels_each_class, 343 | out.data(), 344 | top_count.data()); 345 | }); 346 | //THCudaCheck(cudaGetLastError()); 347 | return std::make_tuple(out, top_count); 348 | } 349 | 350 | std::tuple 351 | dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad, 352 | const at::Tensor &input, 353 | const at::Tensor &bbox, 354 | const at::Tensor &trans, 355 | const at::Tensor &top_count, 356 | const int no_trans, 357 | const float spatial_scale, 358 | const int output_dim, 359 | const int group_size, 360 | const int pooled_size, 361 | const int part_size, 362 | const int sample_per_part, 363 | const float trans_std) 364 | { 365 | /*AT_ASSERTM(out_grad.type().is_cuda(), "out_grad must be a CUDA tensor"); 366 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 367 | AT_ASSERTM(bbox.type().is_cuda(), "bbox must be a CUDA tensor"); 368 | AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); 369 | AT_ASSERTM(top_count.type().is_cuda(), "top_count must be a CUDA tensor");*/ 370 | 371 | const int batch = input.size(0); 372 | const int channels = input.size(1); 373 | const int height = input.size(2); 374 | const int width = input.size(3); 375 | const int channels_trans = no_trans ? 2 : trans.size(1); 376 | const int num_bbox = bbox.size(0); 377 | 378 | AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); 379 | auto pooled_height = pooled_size; 380 | auto pooled_width = pooled_size; 381 | long out_size = num_bbox * output_dim * pooled_height * pooled_width; 382 | const int num_classes = no_trans ? 1 : channels_trans / 2; 383 | const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; 384 | 385 | auto input_grad = at::zeros({batch, channels, height, width}, out_grad.options()); 386 | auto trans_grad = at::zeros_like(trans); 387 | 388 | if (input_grad.numel() == 0) 389 | { 390 | //THCudaCheck(cudaGetLastError()); 391 | return std::make_tuple(input_grad, trans_grad); 392 | } 393 | 394 | /*dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); 395 | dim3 block(512); 396 | cudaStream_t stream = at::cuda::getCurrentCUDAStream();*/ 397 | 398 | AT_DISPATCH_FLOATING_TYPES(out_grad.type(), "dcn_v2_psroi_pooling_cpu_backward", [&] { 399 | DeformablePSROIPoolBackwardAccKernelCpu( 400 | out_size, 401 | out_grad.contiguous().data(), 402 | top_count.contiguous().data(), 403 | num_bbox, 404 | spatial_scale, 405 | channels, 406 | height, 407 | width, 408 | pooled_height, 409 | pooled_width, 410 | output_dim, 411 | input_grad.contiguous().data(), 412 | trans_grad.contiguous().data(), 413 | input.contiguous().data(), 414 | bbox.contiguous().data(), 415 | trans.contiguous().data(), 416 | no_trans, 417 | trans_std, 418 | sample_per_part, 419 | group_size, 420 | part_size, 421 | num_classes, 422 | channels_each_class); 423 | }); 424 | //THCudaCheck(cudaGetLastError()); 425 | return std::make_tuple(input_grad, trans_grad); 426 | } -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/src/cpu/vision.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | at::Tensor 5 | dcn_v2_cpu_forward(const at::Tensor &input, 6 | const at::Tensor &weight, 7 | const at::Tensor &bias, 8 | const at::Tensor &offset, 9 | const at::Tensor &mask, 10 | const int kernel_h, 11 | const int kernel_w, 12 | const int stride_h, 13 | const int stride_w, 14 | const int pad_h, 15 | const int pad_w, 16 | const int dilation_h, 17 | const int dilation_w, 18 | const int deformable_group); 19 | 20 | std::vector 21 | dcn_v2_cpu_backward(const at::Tensor &input, 22 | const at::Tensor &weight, 23 | const at::Tensor &bias, 24 | const at::Tensor &offset, 25 | const at::Tensor &mask, 26 | const at::Tensor &grad_output, 27 | int kernel_h, int kernel_w, 28 | int stride_h, int stride_w, 29 | int pad_h, int pad_w, 30 | int dilation_h, int dilation_w, 31 | int deformable_group); 32 | 33 | 34 | std::tuple 35 | dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input, 36 | const at::Tensor &bbox, 37 | const at::Tensor &trans, 38 | const int no_trans, 39 | const float spatial_scale, 40 | const int output_dim, 41 | const int group_size, 42 | const int pooled_size, 43 | const int part_size, 44 | const int sample_per_part, 45 | const float trans_std); 46 | 47 | std::tuple 48 | dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad, 49 | const at::Tensor &input, 50 | const at::Tensor &bbox, 51 | const at::Tensor &trans, 52 | const at::Tensor &top_count, 53 | const int no_trans, 54 | const float spatial_scale, 55 | const int output_dim, 56 | const int group_size, 57 | const int pooled_size, 58 | const int part_size, 59 | const int sample_per_part, 60 | const float trans_std); -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/src/cuda/dcn_v2_cuda.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "cuda/dcn_v2_im2col_cuda.h" 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | THCState *state = at::globalContext().lazyInitCUDA(); 12 | 13 | // author: 
Charles Shang 14 | // https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu 15 | 16 | // [batch gemm] 17 | // https://github.com/pytorch/pytorch/blob/master/aten/src/THC/generic/THCTensorMathBlas.cu 18 | 19 | __global__ void createBatchGemmBuffer(const float **input_b, float **output_b, 20 | float **columns_b, const float **ones_b, 21 | const float **weight_b, const float **bias_b, 22 | float *input, float *output, 23 | float *columns, float *ones, 24 | float *weight, float *bias, 25 | const int input_stride, const int output_stride, 26 | const int columns_stride, const int ones_stride, 27 | const int num_batches) 28 | { 29 | const int idx = blockIdx.x * blockDim.x + threadIdx.x; 30 | if (idx < num_batches) 31 | { 32 | input_b[idx] = input + idx * input_stride; 33 | output_b[idx] = output + idx * output_stride; 34 | columns_b[idx] = columns + idx * columns_stride; 35 | ones_b[idx] = ones + idx * ones_stride; 36 | // share weights and bias within a Mini-Batch 37 | weight_b[idx] = weight; 38 | bias_b[idx] = bias; 39 | } 40 | } 41 | 42 | at::Tensor 43 | dcn_v2_cuda_forward(const at::Tensor &input, 44 | const at::Tensor &weight, 45 | const at::Tensor &bias, 46 | const at::Tensor &offset, 47 | const at::Tensor &mask, 48 | const int kernel_h, 49 | const int kernel_w, 50 | const int stride_h, 51 | const int stride_w, 52 | const int pad_h, 53 | const int pad_w, 54 | const int dilation_h, 55 | const int dilation_w, 56 | const int deformable_group) 57 | { 58 | using scalar_t = float; 59 | // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask)); 60 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 61 | AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); 62 | AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); 63 | AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); 64 | AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); 65 | 66 | const int batch = input.size(0); 67 | const int channels = input.size(1); 68 | const int height = input.size(2); 69 | const int width = input.size(3); 70 | 71 | const int channels_out = weight.size(0); 72 | const int channels_kernel = weight.size(1); 73 | const int kernel_h_ = weight.size(2); 74 | const int kernel_w_ = weight.size(3); 75 | 76 | // printf("Kernels: %d %d %d %d\n", kernel_h_, kernel_w_, kernel_w, kernel_h); 77 | // printf("Channels: %d %d\n", channels, channels_kernel); 78 | // printf("Channels: %d %d\n", channels_out, channels_kernel); 79 | 80 | AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, 81 | "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); 82 | 83 | AT_ASSERTM(channels == channels_kernel, 84 | "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); 85 | 86 | const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; 87 | const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; 88 | 89 | auto ones = at::ones({batch, height_out, width_out}, input.options()); 90 | auto columns = at::empty({batch, channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); 91 | auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); 92 | 93 | // prepare for batch-wise computing, which is significantly faster than instance-wise computing 94 | // when batch size is large. 
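// Note on the forward pass below (a summary of what the two batched GEMMs compute):
// for each sample b in the mini-batch,
//
//     output[b] = weight * columns[b] + bias,
//
// where columns[b] is the (channels * kernel_h * kernel_w) x (height_out * width_out)
// matrix produced by modulated_deformable_im2col_cuda, weight is viewed as a
// (channels_out) x (channels * kernel_h * kernel_w) matrix, and the bias is broadcast
// over all spatial positions through the all-ones tensor (first SgemmBatched call,
// beta = 0) before the weight * columns product is accumulated on top of it
// (second SgemmBatched call, beta = 1). The per-sample device-pointer arrays
// allocated below (input_b, output_b, columns_b, ones_b, weight_b, bias_b) exist
// only so that a single batched cuBLAS call can cover the whole mini-batch instead
// of issuing one GEMM per sample.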
95 | // launch batch threads 96 | int matrices_size = batch * sizeof(float *); 97 | auto input_b = static_cast(THCudaMalloc(state, matrices_size)); 98 | auto output_b = static_cast(THCudaMalloc(state, matrices_size)); 99 | auto columns_b = static_cast(THCudaMalloc(state, matrices_size)); 100 | auto ones_b = static_cast(THCudaMalloc(state, matrices_size)); 101 | auto weight_b = static_cast(THCudaMalloc(state, matrices_size)); 102 | auto bias_b = static_cast(THCudaMalloc(state, matrices_size)); 103 | 104 | const int block = 128; 105 | const int grid = (batch + block - 1) / block; 106 | 107 | createBatchGemmBuffer<<>>( 108 | input_b, output_b, 109 | columns_b, ones_b, 110 | weight_b, bias_b, 111 | input.data(), 112 | output.data(), 113 | columns.data(), 114 | ones.data(), 115 | weight.data(), 116 | bias.data(), 117 | channels * width * height, 118 | channels_out * width_out * height_out, 119 | channels * kernel_h * kernel_w * height_out * width_out, 120 | height_out * width_out, 121 | batch); 122 | 123 | long m_ = channels_out; 124 | long n_ = height_out * width_out; 125 | long k_ = 1; 126 | THCudaBlas_SgemmBatched(state, 127 | 't', 128 | 'n', 129 | n_, 130 | m_, 131 | k_, 132 | 1.0f, 133 | ones_b, k_, 134 | bias_b, k_, 135 | 0.0f, 136 | output_b, n_, 137 | batch); 138 | 139 | modulated_deformable_im2col_cuda(c10::cuda::getCurrentCUDAStream(), 140 | input.data(), 141 | offset.data(), 142 | mask.data(), 143 | batch, channels, height, width, 144 | height_out, width_out, kernel_h, kernel_w, 145 | pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, 146 | deformable_group, 147 | columns.data()); 148 | 149 | long m = channels_out; 150 | long n = height_out * width_out; 151 | long k = channels * kernel_h * kernel_w; 152 | THCudaBlas_SgemmBatched(state, 153 | 'n', 154 | 'n', 155 | n, 156 | m, 157 | k, 158 | 1.0f, 159 | (const float **)columns_b, n, 160 | weight_b, k, 161 | 1.0f, 162 | output_b, n, 163 | batch); 164 | 165 | THCudaFree(state, input_b); 166 | THCudaFree(state, output_b); 167 | THCudaFree(state, columns_b); 168 | THCudaFree(state, ones_b); 169 | THCudaFree(state, weight_b); 170 | THCudaFree(state, bias_b); 171 | return output; 172 | } 173 | 174 | __global__ void createBatchGemmBufferBackward( 175 | float **grad_output_b, 176 | float **columns_b, 177 | float **ones_b, 178 | float **weight_b, 179 | float **grad_weight_b, 180 | float **grad_bias_b, 181 | float *grad_output, 182 | float *columns, 183 | float *ones, 184 | float *weight, 185 | float *grad_weight, 186 | float *grad_bias, 187 | const int grad_output_stride, 188 | const int columns_stride, 189 | const int ones_stride, 190 | const int num_batches) 191 | { 192 | const int idx = blockIdx.x * blockDim.x + threadIdx.x; 193 | if (idx < num_batches) 194 | { 195 | grad_output_b[idx] = grad_output + idx * grad_output_stride; 196 | columns_b[idx] = columns + idx * columns_stride; 197 | ones_b[idx] = ones + idx * ones_stride; 198 | 199 | // share weights and bias within a Mini-Batch 200 | weight_b[idx] = weight; 201 | grad_weight_b[idx] = grad_weight; 202 | grad_bias_b[idx] = grad_bias; 203 | } 204 | } 205 | 206 | std::vector dcn_v2_cuda_backward(const at::Tensor &input, 207 | const at::Tensor &weight, 208 | const at::Tensor &bias, 209 | const at::Tensor &offset, 210 | const at::Tensor &mask, 211 | const at::Tensor &grad_output, 212 | int kernel_h, int kernel_w, 213 | int stride_h, int stride_w, 214 | int pad_h, int pad_w, 215 | int dilation_h, int dilation_w, 216 | int deformable_group) 217 | { 218 | 219 | 
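// Summary of the backward pass below: unlike the forward pass, gradients are
// computed one sample at a time. For each sample b,
//   1. columns = weight^T * grad_output[b] maps the output gradient back to
//      im2col space;
//   2. modulated_deformable_col2im_coord_cuda scatters it into grad_offset[b] and
//      grad_mask[b], and modulated_deformable_col2im_cuda scatters it into
//      grad_input[b];
//   3. the im2col columns are rebuilt from input[b] so that
//      grad_weight += grad_output[b] * columns^T and
//      grad_bias += grad_output[b] * ones accumulate across the batch.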
THArgCheck(input.is_contiguous(), 1, "input tensor has to be contiguous"); 220 | THArgCheck(weight.is_contiguous(), 2, "weight tensor has to be contiguous"); 221 | 222 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 223 | AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); 224 | AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); 225 | AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); 226 | AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); 227 | 228 | const int batch = input.size(0); 229 | const int channels = input.size(1); 230 | const int height = input.size(2); 231 | const int width = input.size(3); 232 | 233 | const int channels_out = weight.size(0); 234 | const int channels_kernel = weight.size(1); 235 | const int kernel_h_ = weight.size(2); 236 | const int kernel_w_ = weight.size(3); 237 | 238 | AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, 239 | "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); 240 | 241 | AT_ASSERTM(channels == channels_kernel, 242 | "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); 243 | 244 | const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; 245 | const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; 246 | 247 | auto ones = at::ones({height_out, width_out}, input.options()); 248 | auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); 249 | auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); 250 | 251 | auto grad_input = at::zeros_like(input); 252 | auto grad_weight = at::zeros_like(weight); 253 | auto grad_bias = at::zeros_like(bias); 254 | auto grad_offset = at::zeros_like(offset); 255 | auto grad_mask = at::zeros_like(mask); 256 | 257 | using scalar_t = float; 258 | 259 | for (int b = 0; b < batch; b++) 260 | { 261 | auto input_n = input.select(0, b); 262 | auto offset_n = offset.select(0, b); 263 | auto mask_n = mask.select(0, b); 264 | auto grad_output_n = grad_output.select(0, b); 265 | auto grad_input_n = grad_input.select(0, b); 266 | auto grad_offset_n = grad_offset.select(0, b); 267 | auto grad_mask_n = grad_mask.select(0, b); 268 | 269 | long m = channels * kernel_h * kernel_w; 270 | long n = height_out * width_out; 271 | long k = channels_out; 272 | 273 | THCudaBlas_Sgemm(state, 'n', 't', n, m, k, 1.0f, 274 | grad_output_n.data(), n, 275 | weight.data(), m, 0.0f, 276 | columns.data(), n); 277 | 278 | // gradient w.r.t. input coordinate data 279 | modulated_deformable_col2im_coord_cuda(c10::cuda::getCurrentCUDAStream(), 280 | columns.data(), 281 | input_n.data(), 282 | offset_n.data(), 283 | mask_n.data(), 284 | 1, channels, height, width, 285 | height_out, width_out, kernel_h, kernel_w, 286 | pad_h, pad_w, stride_h, stride_w, 287 | dilation_h, dilation_w, deformable_group, 288 | grad_offset_n.data(), 289 | grad_mask_n.data()); 290 | // gradient w.r.t. input data 291 | modulated_deformable_col2im_cuda(c10::cuda::getCurrentCUDAStream(), 292 | columns.data(), 293 | offset_n.data(), 294 | mask_n.data(), 295 | 1, channels, height, width, 296 | height_out, width_out, kernel_h, kernel_w, 297 | pad_h, pad_w, stride_h, stride_w, 298 | dilation_h, dilation_w, deformable_group, 299 | grad_input_n.data()); 300 | 301 | // gradient w.r.t. 
weight, dWeight should accumulate across the batch and group 302 | modulated_deformable_im2col_cuda(c10::cuda::getCurrentCUDAStream(), 303 | input_n.data(), 304 | offset_n.data(), 305 | mask_n.data(), 306 | 1, channels, height, width, 307 | height_out, width_out, kernel_h, kernel_w, 308 | pad_h, pad_w, stride_h, stride_w, 309 | dilation_h, dilation_w, deformable_group, 310 | columns.data()); 311 | 312 | long m_ = channels_out; 313 | long n_ = channels * kernel_h * kernel_w; 314 | long k_ = height_out * width_out; 315 | 316 | THCudaBlas_Sgemm(state, 't', 'n', n_, m_, k_, 1.0f, 317 | columns.data(), k_, 318 | grad_output_n.data(), k_, 1.0f, 319 | grad_weight.data(), n_); 320 | 321 | // gradient w.r.t. bias 322 | // long m_ = channels_out; 323 | // long k__ = height_out * width_out; 324 | THCudaBlas_Sgemv(state, 325 | 't', 326 | k_, m_, 1.0f, 327 | grad_output_n.data(), k_, 328 | ones.data(), 1, 1.0f, 329 | grad_bias.data(), 1); 330 | } 331 | 332 | return { 333 | grad_input, grad_offset, grad_mask, grad_weight, grad_bias 334 | }; 335 | } -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/src/cuda/dcn_v2_im2col_cuda.cu: -------------------------------------------------------------------------------- 1 | #include "dcn_v2_im2col_cuda.h" 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | #define CUDA_KERNEL_LOOP(i, n) \ 14 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ 15 | i < (n); \ 16 | i += blockDim.x * gridDim.x) 17 | 18 | const int CUDA_NUM_THREADS = 1024; 19 | inline int GET_BLOCKS(const int N) 20 | { 21 | return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; 22 | } 23 | 24 | 25 | __device__ float dmcn_im2col_bilinear_cuda(const float *bottom_data, const int data_width, 26 | const int height, const int width, float h, float w) 27 | { 28 | int h_low = floor(h); 29 | int w_low = floor(w); 30 | int h_high = h_low + 1; 31 | int w_high = w_low + 1; 32 | 33 | float lh = h - h_low; 34 | float lw = w - w_low; 35 | float hh = 1 - lh, hw = 1 - lw; 36 | 37 | float v1 = 0; 38 | if (h_low >= 0 && w_low >= 0) 39 | v1 = bottom_data[h_low * data_width + w_low]; 40 | float v2 = 0; 41 | if (h_low >= 0 && w_high <= width - 1) 42 | v2 = bottom_data[h_low * data_width + w_high]; 43 | float v3 = 0; 44 | if (h_high <= height - 1 && w_low >= 0) 45 | v3 = bottom_data[h_high * data_width + w_low]; 46 | float v4 = 0; 47 | if (h_high <= height - 1 && w_high <= width - 1) 48 | v4 = bottom_data[h_high * data_width + w_high]; 49 | 50 | float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; 51 | 52 | float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); 53 | return val; 54 | } 55 | 56 | __device__ float dmcn_get_gradient_weight_cuda(float argmax_h, float argmax_w, 57 | const int h, const int w, const int height, const int width) 58 | { 59 | if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) 60 | { 61 | //empty 62 | return 0; 63 | } 64 | 65 | int argmax_h_low = floor(argmax_h); 66 | int argmax_w_low = floor(argmax_w); 67 | int argmax_h_high = argmax_h_low + 1; 68 | int argmax_w_high = argmax_w_low + 1; 69 | 70 | float weight = 0; 71 | if (h == argmax_h_low && w == argmax_w_low) 72 | weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); 73 | if (h == argmax_h_low && w == argmax_w_high) 74 | weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); 75 | if (h == argmax_h_high && w == argmax_w_low) 76 | weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); 77 | if (h == 
argmax_h_high && w == argmax_w_high) 78 | weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); 79 | return weight; 80 | } 81 | 82 | __device__ float dmcn_get_coordinate_weight_cuda(float argmax_h, float argmax_w, 83 | const int height, const int width, const float *im_data, 84 | const int data_width, const int bp_dir) 85 | { 86 | if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) 87 | { 88 | //empty 89 | return 0; 90 | } 91 | 92 | int argmax_h_low = floor(argmax_h); 93 | int argmax_w_low = floor(argmax_w); 94 | int argmax_h_high = argmax_h_low + 1; 95 | int argmax_w_high = argmax_w_low + 1; 96 | 97 | float weight = 0; 98 | 99 | if (bp_dir == 0) 100 | { 101 | if (argmax_h_low >= 0 && argmax_w_low >= 0) 102 | weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; 103 | if (argmax_h_low >= 0 && argmax_w_high <= width - 1) 104 | weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; 105 | if (argmax_h_high <= height - 1 && argmax_w_low >= 0) 106 | weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; 107 | if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) 108 | weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; 109 | } 110 | else if (bp_dir == 1) 111 | { 112 | if (argmax_h_low >= 0 && argmax_w_low >= 0) 113 | weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; 114 | if (argmax_h_low >= 0 && argmax_w_high <= width - 1) 115 | weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; 116 | if (argmax_h_high <= height - 1 && argmax_w_low >= 0) 117 | weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; 118 | if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) 119 | weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; 120 | } 121 | 122 | return weight; 123 | } 124 | 125 | __global__ void modulated_deformable_im2col_gpu_kernel(const int n, 126 | const float *data_im, const float *data_offset, const float *data_mask, 127 | const int height, const int width, const int kernel_h, const int kernel_w, 128 | const int pad_h, const int pad_w, 129 | const int stride_h, const int stride_w, 130 | const int dilation_h, const int dilation_w, 131 | const int channel_per_deformable_group, 132 | const int batch_size, const int num_channels, const int deformable_group, 133 | const int height_col, const int width_col, 134 | float *data_col) 135 | { 136 | // launch channels * batch_size * height_col * width_col cores 137 | CUDA_KERNEL_LOOP(index, n) 138 | { 139 | // NOTE(CharlesShang): different from Dai Jifeng's MXNet implementation, col_buffer is of shape (c*kw*kh, N, oh, ow) 140 | // here columns is of shape (N, c*kw*kh, oh * ow), need to adapt axis 141 | 142 | // index index of output matrix 143 | const int w_col = index % width_col; 144 | const int h_col = (index / width_col) % height_col; 145 | // const int b_col = (index / width_col / height_col) % batch_size; 146 | const int b_col = (index / width_col / height_col / num_channels) % batch_size; 147 | // const int c_im = (index / width_col / height_col) / batch_size; 148 | const int c_im = (index / width_col / height_col) % num_channels; 149 | // const int c_col = c_im * kernel_h * kernel_w; 150 | const int c_col = c_im * kernel_h * kernel_w; 151 | 152 | // compute deformable group index 153 | 
const int deformable_group_index = c_im / channel_per_deformable_group; 154 | 155 | const int h_in = h_col * stride_h - pad_h; 156 | const int w_in = w_col * stride_w - pad_w; 157 | 158 | // float *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; 159 | float *data_col_ptr = data_col + ((b_col * num_channels * kernel_w * kernel_h + c_col) * height_col + h_col) * width_col + w_col; 160 | //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; 161 | const float *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; 162 | const float *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; 163 | 164 | const float *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; 165 | 166 | for (int i = 0; i < kernel_h; ++i) 167 | { 168 | for (int j = 0; j < kernel_w; ++j) 169 | { 170 | const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; 171 | const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; 172 | const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; 173 | const float offset_h = data_offset_ptr[data_offset_h_ptr]; 174 | const float offset_w = data_offset_ptr[data_offset_w_ptr]; 175 | const float mask = data_mask_ptr[data_mask_hw_ptr]; 176 | float val = static_cast(0); 177 | const float h_im = h_in + i * dilation_h + offset_h; 178 | const float w_im = w_in + j * dilation_w + offset_w; 179 | //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { 180 | if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) 181 | { 182 | //const float map_h = i * dilation_h + offset_h; 183 | //const float map_w = j * dilation_w + offset_w; 184 | //const int cur_height = height - h_in; 185 | //const int cur_width = width - w_in; 186 | //val = dmcn_im2col_bilinear_cuda(data_im_ptr, width, cur_height, cur_width, map_h, map_w); 187 | val = dmcn_im2col_bilinear_cuda(data_im_ptr, width, height, width, h_im, w_im); 188 | } 189 | *data_col_ptr = val * mask; 190 | // data_col_ptr += batch_size * height_col * width_col; 191 | data_col_ptr += height_col * width_col; 192 | } 193 | } 194 | } 195 | } 196 | 197 | __global__ void modulated_deformable_col2im_gpu_kernel(const int n, 198 | const float *data_col, const float *data_offset, const float *data_mask, 199 | const int channels, const int height, const int width, 200 | const int kernel_h, const int kernel_w, 201 | const int pad_h, const int pad_w, 202 | const int stride_h, const int stride_w, 203 | const int dilation_h, const int dilation_w, 204 | const int channel_per_deformable_group, 205 | const int batch_size, const int deformable_group, 206 | const int height_col, const int width_col, 207 | float *grad_im) 208 | { 209 | CUDA_KERNEL_LOOP(index, n) 210 | { 211 | const int j = (index / width_col / height_col / batch_size) % kernel_w; 212 | const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; 213 | const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; 214 | // compute the start and end of the output 215 | 216 | const int deformable_group_index = c / channel_per_deformable_group; 217 | 218 | int w_out = index % width_col; 219 | int h_out = (index / width_col) % height_col; 220 | int b = (index / width_col / 
height_col) % batch_size; 221 | int w_in = w_out * stride_w - pad_w; 222 | int h_in = h_out * stride_h - pad_h; 223 | 224 | const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; 225 | const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; 226 | const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; 227 | const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; 228 | const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; 229 | const float offset_h = data_offset_ptr[data_offset_h_ptr]; 230 | const float offset_w = data_offset_ptr[data_offset_w_ptr]; 231 | const float mask = data_mask_ptr[data_mask_hw_ptr]; 232 | const float cur_inv_h_data = h_in + i * dilation_h + offset_h; 233 | const float cur_inv_w_data = w_in + j * dilation_w + offset_w; 234 | 235 | const float cur_top_grad = data_col[index] * mask; 236 | const int cur_h = (int)cur_inv_h_data; 237 | const int cur_w = (int)cur_inv_w_data; 238 | for (int dy = -2; dy <= 2; dy++) 239 | { 240 | for (int dx = -2; dx <= 2; dx++) 241 | { 242 | if (cur_h + dy >= 0 && cur_h + dy < height && 243 | cur_w + dx >= 0 && cur_w + dx < width && 244 | abs(cur_inv_h_data - (cur_h + dy)) < 1 && 245 | abs(cur_inv_w_data - (cur_w + dx)) < 1) 246 | { 247 | int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; 248 | float weight = dmcn_get_gradient_weight_cuda(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); 249 | atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); 250 | } 251 | } 252 | } 253 | } 254 | } 255 | 256 | __global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n, 257 | const float *data_col, const float *data_im, 258 | const float *data_offset, const float *data_mask, 259 | const int channels, const int height, const int width, 260 | const int kernel_h, const int kernel_w, 261 | const int pad_h, const int pad_w, 262 | const int stride_h, const int stride_w, 263 | const int dilation_h, const int dilation_w, 264 | const int channel_per_deformable_group, 265 | const int batch_size, const int offset_channels, const int deformable_group, 266 | const int height_col, const int width_col, 267 | float *grad_offset, float *grad_mask) 268 | { 269 | CUDA_KERNEL_LOOP(index, n) 270 | { 271 | float val = 0, mval = 0; 272 | int w = index % width_col; 273 | int h = (index / width_col) % height_col; 274 | int c = (index / width_col / height_col) % offset_channels; 275 | int b = (index / width_col / height_col) / offset_channels; 276 | // compute the start and end of the output 277 | 278 | const int deformable_group_index = c / (2 * kernel_h * kernel_w); 279 | const int col_step = kernel_h * kernel_w; 280 | int cnt = 0; 281 | const float *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; 282 | const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; 283 | const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; 284 | const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col 
* width_col; 285 | 286 | const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; 287 | 288 | for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) 289 | { 290 | const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; 291 | const int bp_dir = offset_c % 2; 292 | 293 | int j = (col_pos / width_col / height_col / batch_size) % kernel_w; 294 | int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; 295 | int w_out = col_pos % width_col; 296 | int h_out = (col_pos / width_col) % height_col; 297 | int w_in = w_out * stride_w - pad_w; 298 | int h_in = h_out * stride_h - pad_h; 299 | const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); 300 | const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); 301 | const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); 302 | const float offset_h = data_offset_ptr[data_offset_h_ptr]; 303 | const float offset_w = data_offset_ptr[data_offset_w_ptr]; 304 | const float mask = data_mask_ptr[data_mask_hw_ptr]; 305 | float inv_h = h_in + i * dilation_h + offset_h; 306 | float inv_w = w_in + j * dilation_w + offset_w; 307 | if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) 308 | { 309 | inv_h = inv_w = -2; 310 | } 311 | else 312 | { 313 | mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear_cuda(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); 314 | } 315 | const float weight = dmcn_get_coordinate_weight_cuda( 316 | inv_h, inv_w, 317 | height, width, data_im_ptr + cnt * height * width, width, bp_dir); 318 | val += weight * data_col_ptr[col_pos] * mask; 319 | cnt += 1; 320 | } 321 | // KERNEL_ASSIGN(grad_offset[index], offset_req, val); 322 | grad_offset[index] = val; 323 | if (offset_c % 2 == 0) 324 | // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval); 325 | grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; 326 | } 327 | } 328 | 329 | void modulated_deformable_im2col_cuda(cudaStream_t stream, 330 | const float* data_im, const float* data_offset, const float* data_mask, 331 | const int batch_size, const int channels, const int height_im, const int width_im, 332 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 333 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 334 | const int dilation_h, const int dilation_w, 335 | const int deformable_group, float* data_col) { 336 | // num_axes should be smaller than block size 337 | const int channel_per_deformable_group = channels / deformable_group; 338 | const int num_kernels = channels * batch_size * height_col * width_col; 339 | modulated_deformable_im2col_gpu_kernel 340 | <<>>( 342 | num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w, 343 | pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, 344 | batch_size, channels, deformable_group, height_col, width_col, data_col); 345 | 346 | cudaError_t err = cudaGetLastError(); 347 | if (err != cudaSuccess) 348 | { 349 | printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); 350 | } 351 | 352 | } 353 | 354 | void 
modulated_deformable_col2im_cuda(cudaStream_t stream, 355 | const float* data_col, const float* data_offset, const float* data_mask, 356 | const int batch_size, const int channels, const int height_im, const int width_im, 357 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 358 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 359 | const int dilation_h, const int dilation_w, 360 | const int deformable_group, float* grad_im){ 361 | 362 | const int channel_per_deformable_group = channels / deformable_group; 363 | const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; 364 | modulated_deformable_col2im_gpu_kernel 365 | <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 366 | 0, stream>>>( 367 | num_kernels, data_col, data_offset, data_mask, channels, height_im, width_im, 368 | kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, 369 | dilation_h, dilation_w, channel_per_deformable_group, 370 | batch_size, deformable_group, height_col, width_col, grad_im); 371 | cudaError_t err = cudaGetLastError(); 372 | if (err != cudaSuccess) 373 | { 374 | printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); 375 | } 376 | 377 | } 378 | 379 | void modulated_deformable_col2im_coord_cuda(cudaStream_t stream, 380 | const float* data_col, const float* data_im, const float* data_offset, const float* data_mask, 381 | const int batch_size, const int channels, const int height_im, const int width_im, 382 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 383 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 384 | const int dilation_h, const int dilation_w, 385 | const int deformable_group, 386 | float* grad_offset, float* grad_mask) { 387 | const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; 388 | const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; 389 | modulated_deformable_col2im_coord_gpu_kernel 390 | <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 391 | 0, stream>>>( 392 | num_kernels, data_col, data_im, data_offset, data_mask, channels, height_im, width_im, 393 | kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, 394 | dilation_h, dilation_w, channel_per_deformable_group, 395 | batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, 396 | grad_offset, grad_mask); 397 | cudaError_t err = cudaGetLastError(); 398 | if (err != cudaSuccess) 399 | { 400 | printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); 401 | } 402 | } -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/src/cuda/dcn_v2_im2col_cuda.h: -------------------------------------------------------------------------------- 1 | 2 | /*! 3 | ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** 4 | * 5 | * COPYRIGHT 6 | * 7 | * All contributions by the University of California: 8 | * Copyright (c) 2014-2017 The Regents of the University of California (Regents) 9 | * All rights reserved. 10 | * 11 | * All other contributions: 12 | * Copyright (c) 2014-2017, the respective contributors 13 | * All rights reserved. 14 | * 15 | * Caffe uses a shared copyright model: each contributor holds copyright over 16 | * their contributions to Caffe. The project versioning records all such 17 | * contribution and copyright details.
If a contributor wants to further mark 18 | * their specific copyright on a particular contribution, they should indicate 19 | * their copyright solely in the commit message of the change when it is 20 | * committed. 21 | * 22 | * LICENSE 23 | * 24 | * Redistribution and use in source and binary forms, with or without 25 | * modification, are permitted provided that the following conditions are met: 26 | * 27 | * 1. Redistributions of source code must retain the above copyright notice, this 28 | * list of conditions and the following disclaimer. 29 | * 2. Redistributions in binary form must reproduce the above copyright notice, 30 | * this list of conditions and the following disclaimer in the documentation 31 | * and/or other materials provided with the distribution. 32 | * 33 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 34 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 35 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 36 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 37 | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 38 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 39 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 40 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 41 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 42 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 43 | * 44 | * CONTRIBUTION AGREEMENT 45 | * 46 | * By contributing to the BVLC/caffe repository through pull-request, comment, 47 | * or otherwise, the contributor releases their content to the 48 | * license and copyright terms herein. 49 | * 50 | ***************** END Caffe Copyright Notice and Disclaimer ******************** 51 | * 52 | * Copyright (c) 2018 Microsoft 53 | * Licensed under The MIT License [see LICENSE for details] 54 | * \file modulated_deformable_im2col.h 55 | * \brief Function definitions of converting an image to 56 | * column matrix based on kernel, padding, dilation, and offset. 57 | * These functions are mainly used in deformable convolution operators. 
58 | * \ref: https://arxiv.org/abs/1811.11168 59 | * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu 60 | */ 61 | 62 | /***************** Adapted by Charles Shang *********************/ 63 | 64 | #ifndef DCN_V2_IM2COL_CUDA 65 | #define DCN_V2_IM2COL_CUDA 66 | 67 | #ifdef __cplusplus 68 | extern "C" 69 | { 70 | #endif 71 | 72 | void modulated_deformable_im2col_cuda(cudaStream_t stream, 73 | const float *data_im, const float *data_offset, const float *data_mask, 74 | const int batch_size, const int channels, const int height_im, const int width_im, 75 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 76 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 77 | const int dilation_h, const int dilation_w, 78 | const int deformable_group, float *data_col); 79 | 80 | void modulated_deformable_col2im_cuda(cudaStream_t stream, 81 | const float *data_col, const float *data_offset, const float *data_mask, 82 | const int batch_size, const int channels, const int height_im, const int width_im, 83 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 84 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 85 | const int dilation_h, const int dilation_w, 86 | const int deformable_group, float *grad_im); 87 | 88 | void modulated_deformable_col2im_coord_cuda(cudaStream_t stream, 89 | const float *data_col, const float *data_im, const float *data_offset, const float *data_mask, 90 | const int batch_size, const int channels, const int height_im, const int width_im, 91 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 92 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 93 | const int dilation_h, const int dilation_w, 94 | const int deformable_group, 95 | float *grad_offset, float *grad_mask); 96 | 97 | #ifdef __cplusplus 98 | } 99 | #endif 100 | 101 | #endif -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.cu: -------------------------------------------------------------------------------- 1 | /*!
2 | * Copyright (c) 2017 Microsoft 3 | * Licensed under The MIT License [see LICENSE for details] 4 | * \file deformable_psroi_pooling.cu 5 | * \brief 6 | * \author Yi Li, Guodong Zhang, Jifeng Dai 7 | */ 8 | /***************** Adapted by Charles Shang *********************/ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | 18 | #include 19 | #include 20 | #include 21 | 22 | #define CUDA_KERNEL_LOOP(i, n) \ 23 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ 24 | i < (n); \ 25 | i += blockDim.x * gridDim.x) 26 | 27 | const int CUDA_NUM_THREADS = 1024; 28 | inline int GET_BLOCKS(const int N) 29 | { 30 | return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; 31 | } 32 | 33 | template 34 | __device__ T bilinear_interp_cuda( 35 | const T *data, 36 | const T x, 37 | const T y, 38 | const int width, 39 | const int height) 40 | { 41 | int x1 = floor(x); 42 | int x2 = ceil(x); 43 | int y1 = floor(y); 44 | int y2 = ceil(y); 45 | T dist_x = static_cast(x - x1); 46 | T dist_y = static_cast(y - y1); 47 | T value11 = data[y1 * width + x1]; 48 | T value12 = data[y2 * width + x1]; 49 | T value21 = data[y1 * width + x2]; 50 | T value22 = data[y2 * width + x2]; 51 | T value = (1 - dist_x) * (1 - dist_y) * value11 + 52 | (1 - dist_x) * dist_y * value12 + 53 | dist_x * (1 - dist_y) * value21 + 54 | dist_x * dist_y * value22; 55 | return value; 56 | } 57 | 58 | template 59 | __global__ void DeformablePSROIPoolForwardKernelCuda( 60 | const int count, 61 | const T *bottom_data, 62 | const T spatial_scale, 63 | const int channels, 64 | const int height, const int width, 65 | const int pooled_height, const int pooled_width, 66 | const T *bottom_rois, const T *bottom_trans, 67 | const int no_trans, 68 | const T trans_std, 69 | const int sample_per_part, 70 | const int output_dim, 71 | const int group_size, 72 | const int part_size, 73 | const int num_classes, 74 | const int channels_each_class, 75 | T *top_data, 76 | T *top_count) 77 | { 78 | CUDA_KERNEL_LOOP(index, count) 79 | { 80 | // The output is in order (n, ctop, ph, pw) 81 | int pw = index % pooled_width; 82 | int ph = (index / pooled_width) % pooled_height; 83 | int ctop = (index / pooled_width / pooled_height) % output_dim; 84 | int n = index / pooled_width / pooled_height / output_dim; 85 | 86 | // [start, end) interval for spatial sampling 87 | const T *offset_bottom_rois = bottom_rois + n * 5; 88 | int roi_batch_ind = offset_bottom_rois[0]; 89 | T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; 90 | T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; 91 | T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; 92 | T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; 93 | 94 | // Force too small ROIs to be 1x1 95 | T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 96 | T roi_height = max(roi_end_h - roi_start_h, 0.1); 97 | 98 | // Compute w and h at bottom 99 | T bin_size_h = roi_height / static_cast(pooled_height); 100 | T bin_size_w = roi_width / static_cast(pooled_width); 101 | 102 | T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); 103 | T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); 104 | 105 | int part_h = floor(static_cast(ph) / pooled_height * part_size); 106 | int part_w = floor(static_cast(pw) / pooled_width * part_size); 107 | int class_id = ctop / channels_each_class; 108 | T trans_x = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; 109 | T trans_y = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; 110 | 111 | T wstart = static_cast(pw) * bin_size_w + roi_start_w; 112 | wstart += trans_x * roi_width; 113 | T hstart = static_cast(ph) * bin_size_h + roi_start_h; 114 | hstart += trans_y * roi_height; 115 | 116 | T sum = 0; 117 | int count = 0; 118 | int gw = floor(static_cast(pw) * group_size / pooled_width); 119 | int gh = floor(static_cast(ph) * group_size / pooled_height); 120 | gw = min(max(gw, 0), group_size - 1); 121 | gh = min(max(gh, 0), group_size - 1); 122 | 123 | const T *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; 124 | for (int ih = 0; ih < sample_per_part; ih++) 125 | { 126 | for (int iw = 0; iw < sample_per_part; iw++) 127 | { 128 | T w = wstart + iw * sub_bin_size_w; 129 | T h = hstart + ih * sub_bin_size_h; 130 | // bilinear interpolation 131 | if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) 132 | { 133 | continue; 134 | } 135 | w = min(max(w, 0.), width - 1.); 136 | h = min(max(h, 0.), height - 1.); 137 | int c = (ctop * group_size + gh) * group_size + gw; 138 | T val = bilinear_interp_cuda(offset_bottom_data + c * height * width, w, h, width, height); 139 | sum += val; 140 | count++; 141 | } 142 | } 143 | top_data[index] = count == 0 ? static_cast(0) : sum / count; 144 | top_count[index] = count; 145 | } 146 | } 147 | 148 | template 149 | __global__ void DeformablePSROIPoolBackwardAccKernelCuda( 150 | const int count, 151 | const T *top_diff, 152 | const T *top_count, 153 | const int num_rois, 154 | const T spatial_scale, 155 | const int channels, 156 | const int height, const int width, 157 | const int pooled_height, const int pooled_width, 158 | const int output_dim, 159 | T *bottom_data_diff, T *bottom_trans_diff, 160 | const T *bottom_data, 161 | const T *bottom_rois, 162 | const T *bottom_trans, 163 | const int no_trans, 164 | const T trans_std, 165 | const int sample_per_part, 166 | const int group_size, 167 | const int part_size, 168 | const int num_classes, 169 | const int channels_each_class) 170 | { 171 | CUDA_KERNEL_LOOP(index, count) 172 | { 173 | // The output is in order (n, ctop, ph, pw) 174 | int pw = index % pooled_width; 175 | int ph = (index / pooled_width) % pooled_height; 176 | int ctop = (index / pooled_width / pooled_height) % output_dim; 177 | int n = index / pooled_width / pooled_height / output_dim; 178 | 179 | // [start, end) interval for spatial sampling 180 | const T *offset_bottom_rois = bottom_rois + n * 5; 181 | int roi_batch_ind = offset_bottom_rois[0]; 182 | T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; 183 | T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; 184 | T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; 185 | T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) 
* spatial_scale - 0.5; 186 | 187 | // Force too small ROIs to be 1x1 188 | T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 189 | T roi_height = max(roi_end_h - roi_start_h, 0.1); 190 | 191 | // Compute w and h at bottom 192 | T bin_size_h = roi_height / static_cast(pooled_height); 193 | T bin_size_w = roi_width / static_cast(pooled_width); 194 | 195 | T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); 196 | T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); 197 | 198 | int part_h = floor(static_cast(ph) / pooled_height * part_size); 199 | int part_w = floor(static_cast(pw) / pooled_width * part_size); 200 | int class_id = ctop / channels_each_class; 201 | T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; 202 | T trans_y = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; 203 | 204 | T wstart = static_cast(pw) * bin_size_w + roi_start_w; 205 | wstart += trans_x * roi_width; 206 | T hstart = static_cast(ph) * bin_size_h + roi_start_h; 207 | hstart += trans_y * roi_height; 208 | 209 | if (top_count[index] <= 0) 210 | { 211 | continue; 212 | } 213 | T diff_val = top_diff[index] / top_count[index]; 214 | const T *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; 215 | T *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; 216 | int gw = floor(static_cast(pw) * group_size / pooled_width); 217 | int gh = floor(static_cast(ph) * group_size / pooled_height); 218 | gw = min(max(gw, 0), group_size - 1); 219 | gh = min(max(gh, 0), group_size - 1); 220 | 221 | for (int ih = 0; ih < sample_per_part; ih++) 222 | { 223 | for (int iw = 0; iw < sample_per_part; iw++) 224 | { 225 | T w = wstart + iw * sub_bin_size_w; 226 | T h = hstart + ih * sub_bin_size_h; 227 | // bilinear interpolation 228 | if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) 229 | { 230 | continue; 231 | } 232 | w = min(max(w, 0.), width - 1.); 233 | h = min(max(h, 0.), height - 1.); 234 | int c = (ctop * group_size + gh) * group_size + gw; 235 | // backward on feature 236 | int x0 = floor(w); 237 | int x1 = ceil(w); 238 | int y0 = floor(h); 239 | int y1 = ceil(h); 240 | T dist_x = w - x0, dist_y = h - y0; 241 | T q00 = (1 - dist_x) * (1 - dist_y); 242 | T q01 = (1 - dist_x) * dist_y; 243 | T q10 = dist_x * (1 - dist_y); 244 | T q11 = dist_x * dist_y; 245 | int bottom_index_base = c * height * width; 246 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); 247 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); 248 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); 249 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val); 250 | 251 | if (no_trans) 252 | { 253 | continue; 254 | } 255 | T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; 256 | T U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; 257 | T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; 258 | T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; 259 | T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; 260 | diff_x *= roi_width; 261 | T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std 
* diff_val; 262 | diff_y *= roi_height; 263 | 264 | atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x); 265 | atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y); 266 | } 267 | } 268 | } 269 | } 270 | 271 | std::tuple 272 | dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input, 273 | const at::Tensor &bbox, 274 | const at::Tensor &trans, 275 | const int no_trans, 276 | const float spatial_scale, 277 | const int output_dim, 278 | const int group_size, 279 | const int pooled_size, 280 | const int part_size, 281 | const int sample_per_part, 282 | const float trans_std) 283 | { 284 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 285 | AT_ASSERTM(bbox.type().is_cuda(), "rois must be a CUDA tensor"); 286 | AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); 287 | 288 | const int batch = input.size(0); 289 | const int channels = input.size(1); 290 | const int height = input.size(2); 291 | const int width = input.size(3); 292 | const int channels_trans = no_trans ? 2 : trans.size(1); 293 | const int num_bbox = bbox.size(0); 294 | 295 | AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); 296 | auto pooled_height = pooled_size; 297 | auto pooled_width = pooled_size; 298 | 299 | auto out = at::empty({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); 300 | long out_size = num_bbox * output_dim * pooled_height * pooled_width; 301 | auto top_count = at::zeros({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); 302 | 303 | const int num_classes = no_trans ? 1 : channels_trans / 2; 304 | const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; 305 | 306 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 307 | 308 | if (out.numel() == 0) 309 | { 310 | THCudaCheck(cudaGetLastError()); 311 | return std::make_tuple(out, top_count); 312 | } 313 | 314 | dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); 315 | dim3 block(512); 316 | 317 | AT_DISPATCH_FLOATING_TYPES(input.type(), "dcn_v2_psroi_pooling_cuda_forward", [&] { 318 | DeformablePSROIPoolForwardKernelCuda<<>>( 319 | out_size, 320 | input.contiguous().data(), 321 | spatial_scale, 322 | channels, 323 | height, width, 324 | pooled_height, 325 | pooled_width, 326 | bbox.contiguous().data(), 327 | trans.contiguous().data(), 328 | no_trans, 329 | trans_std, 330 | sample_per_part, 331 | output_dim, 332 | group_size, 333 | part_size, 334 | num_classes, 335 | channels_each_class, 336 | out.data(), 337 | top_count.data()); 338 | }); 339 | THCudaCheck(cudaGetLastError()); 340 | return std::make_tuple(out, top_count); 341 | } 342 | 343 | std::tuple 344 | dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad, 345 | const at::Tensor &input, 346 | const at::Tensor &bbox, 347 | const at::Tensor &trans, 348 | const at::Tensor &top_count, 349 | const int no_trans, 350 | const float spatial_scale, 351 | const int output_dim, 352 | const int group_size, 353 | const int pooled_size, 354 | const int part_size, 355 | const int sample_per_part, 356 | const float trans_std) 357 | { 358 | AT_ASSERTM(out_grad.type().is_cuda(), "out_grad must be a CUDA tensor"); 359 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 360 | AT_ASSERTM(bbox.type().is_cuda(), "bbox must be a CUDA tensor"); 361 | AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); 362 | AT_ASSERTM(top_count.type().is_cuda(), "top_count must be a CUDA tensor"); 363 | 364 | const int batch = input.size(0); 365 | const int channels = input.size(1); 366 | const int height = input.size(2); 367 | const int width = input.size(3); 368 | const int channels_trans = no_trans ? 2 : trans.size(1); 369 | const int num_bbox = bbox.size(0); 370 | 371 | AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); 372 | auto pooled_height = pooled_size; 373 | auto pooled_width = pooled_size; 374 | long out_size = num_bbox * output_dim * pooled_height * pooled_width; 375 | const int num_classes = no_trans ? 1 : channels_trans / 2; 376 | const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; 377 | 378 | auto input_grad = at::zeros({batch, channels, height, width}, out_grad.options()); 379 | auto trans_grad = at::zeros_like(trans); 380 | 381 | if (input_grad.numel() == 0) 382 | { 383 | THCudaCheck(cudaGetLastError()); 384 | return std::make_tuple(input_grad, trans_grad); 385 | } 386 | 387 | dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); 388 | dim3 block(512); 389 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 390 | 391 | AT_DISPATCH_FLOATING_TYPES(out_grad.type(), "dcn_v2_psroi_pooling_cuda_backward", [&] { 392 | DeformablePSROIPoolBackwardAccKernelCuda<<>>( 393 | out_size, 394 | out_grad.contiguous().data(), 395 | top_count.contiguous().data(), 396 | num_bbox, 397 | spatial_scale, 398 | channels, 399 | height, 400 | width, 401 | pooled_height, 402 | pooled_width, 403 | output_dim, 404 | input_grad.contiguous().data(), 405 | trans_grad.contiguous().data(), 406 | input.contiguous().data(), 407 | bbox.contiguous().data(), 408 | trans.contiguous().data(), 409 | no_trans, 410 | trans_std, 411 | sample_per_part, 412 | group_size, 413 | part_size, 414 | num_classes, 415 | channels_each_class); 416 | }); 417 | THCudaCheck(cudaGetLastError()); 418 | return std::make_tuple(input_grad, trans_grad); 419 | } -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/src/cuda/vision.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | at::Tensor 5 | dcn_v2_cuda_forward(const at::Tensor &input, 6 | const at::Tensor &weight, 7 | const at::Tensor &bias, 8 | const at::Tensor &offset, 9 | const at::Tensor &mask, 10 | const int kernel_h, 11 | const int kernel_w, 12 | const int stride_h, 13 | const int stride_w, 14 | const int pad_h, 15 | const int pad_w, 16 | const int dilation_h, 17 | const int dilation_w, 18 | const int deformable_group); 19 | 20 | std::vector 21 | dcn_v2_cuda_backward(const at::Tensor &input, 22 | const at::Tensor &weight, 23 | const at::Tensor &bias, 24 | const at::Tensor &offset, 25 | const at::Tensor &mask, 26 | const at::Tensor &grad_output, 27 | int kernel_h, int kernel_w, 28 | int stride_h, int stride_w, 29 | int pad_h, int pad_w, 30 | int dilation_h, int dilation_w, 31 | int deformable_group); 32 | 33 | 34 | std::tuple 35 | dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input, 36 | const at::Tensor &bbox, 37 | const at::Tensor &trans, 38 | const int no_trans, 39 | const float spatial_scale, 40 | const int output_dim, 41 | const int group_size, 42 | const int pooled_size, 43 | const int part_size, 44 | const int sample_per_part, 45 | const float trans_std); 46 | 47 | std::tuple 48 | dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad, 49 | const at::Tensor &input, 50 | const at::Tensor &bbox, 51 | const at::Tensor &trans, 52 | const at::Tensor &top_count, 53 | const int no_trans, 54 | const float spatial_scale, 55 | const int output_dim, 56 | const int group_size, 57 | const int pooled_size, 58 | const int part_size, 59 | const int sample_per_part, 60 | const float trans_std); -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/src/dcn_v2.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | at::Tensor 10 | dcn_v2_forward(const at::Tensor &input, 11 | const at::Tensor &weight, 12 | const 
at::Tensor &bias, 13 | const at::Tensor &offset, 14 | const at::Tensor &mask, 15 | const int kernel_h, 16 | const int kernel_w, 17 | const int stride_h, 18 | const int stride_w, 19 | const int pad_h, 20 | const int pad_w, 21 | const int dilation_h, 22 | const int dilation_w, 23 | const int deformable_group) 24 | { 25 | if (input.type().is_cuda()) 26 | { 27 | #ifdef WITH_CUDA 28 | return dcn_v2_cuda_forward(input, weight, bias, offset, mask, 29 | kernel_h, kernel_w, 30 | stride_h, stride_w, 31 | pad_h, pad_w, 32 | dilation_h, dilation_w, 33 | deformable_group); 34 | #else 35 | AT_ERROR("Not compiled with GPU support"); 36 | #endif 37 | } 38 | else{ 39 | return dcn_v2_cpu_forward(input, weight, bias, offset, mask, 40 | kernel_h, kernel_w, 41 | stride_h, stride_w, 42 | pad_h, pad_w, 43 | dilation_h, dilation_w, 44 | deformable_group); 45 | } 46 | } 47 | 48 | std::vector 49 | dcn_v2_backward(const at::Tensor &input, 50 | const at::Tensor &weight, 51 | const at::Tensor &bias, 52 | const at::Tensor &offset, 53 | const at::Tensor &mask, 54 | const at::Tensor &grad_output, 55 | int kernel_h, int kernel_w, 56 | int stride_h, int stride_w, 57 | int pad_h, int pad_w, 58 | int dilation_h, int dilation_w, 59 | int deformable_group) 60 | { 61 | if (input.type().is_cuda()) 62 | { 63 | #ifdef WITH_CUDA 64 | return dcn_v2_cuda_backward(input, 65 | weight, 66 | bias, 67 | offset, 68 | mask, 69 | grad_output, 70 | kernel_h, kernel_w, 71 | stride_h, stride_w, 72 | pad_h, pad_w, 73 | dilation_h, dilation_w, 74 | deformable_group); 75 | #else 76 | AT_ERROR("Not compiled with GPU support"); 77 | #endif 78 | } 79 | else{ 80 | return dcn_v2_cpu_backward(input, 81 | weight, 82 | bias, 83 | offset, 84 | mask, 85 | grad_output, 86 | kernel_h, kernel_w, 87 | stride_h, stride_w, 88 | pad_h, pad_w, 89 | dilation_h, dilation_w, 90 | deformable_group); 91 | } 92 | } 93 | 94 | std::tuple 95 | dcn_v2_psroi_pooling_forward(const at::Tensor &input, 96 | const at::Tensor &bbox, 97 | const at::Tensor &trans, 98 | const int no_trans, 99 | const float spatial_scale, 100 | const int output_dim, 101 | const int group_size, 102 | const int pooled_size, 103 | const int part_size, 104 | const int sample_per_part, 105 | const float trans_std) 106 | { 107 | if (input.type().is_cuda()) 108 | { 109 | #ifdef WITH_CUDA 110 | return dcn_v2_psroi_pooling_cuda_forward(input, 111 | bbox, 112 | trans, 113 | no_trans, 114 | spatial_scale, 115 | output_dim, 116 | group_size, 117 | pooled_size, 118 | part_size, 119 | sample_per_part, 120 | trans_std); 121 | #else 122 | AT_ERROR("Not compiled with GPU support"); 123 | #endif 124 | } 125 | else{ 126 | return dcn_v2_psroi_pooling_cpu_forward(input, 127 | bbox, 128 | trans, 129 | no_trans, 130 | spatial_scale, 131 | output_dim, 132 | group_size, 133 | pooled_size, 134 | part_size, 135 | sample_per_part, 136 | trans_std); 137 | } 138 | } 139 | 140 | std::tuple 141 | dcn_v2_psroi_pooling_backward(const at::Tensor &out_grad, 142 | const at::Tensor &input, 143 | const at::Tensor &bbox, 144 | const at::Tensor &trans, 145 | const at::Tensor &top_count, 146 | const int no_trans, 147 | const float spatial_scale, 148 | const int output_dim, 149 | const int group_size, 150 | const int pooled_size, 151 | const int part_size, 152 | const int sample_per_part, 153 | const float trans_std) 154 | { 155 | if (input.type().is_cuda()) 156 | { 157 | #ifdef WITH_CUDA 158 | return dcn_v2_psroi_pooling_cuda_backward(out_grad, 159 | input, 160 | bbox, 161 | trans, 162 | top_count, 163 | no_trans, 164 | spatial_scale, 165 | 
output_dim, 166 | group_size, 167 | pooled_size, 168 | part_size, 169 | sample_per_part, 170 | trans_std); 171 | #else 172 | AT_ERROR("Not compiled with GPU support"); 173 | #endif 174 | } 175 | else{ 176 | return dcn_v2_psroi_pooling_cpu_backward(out_grad, 177 | input, 178 | bbox, 179 | trans, 180 | top_count, 181 | no_trans, 182 | spatial_scale, 183 | output_dim, 184 | group_size, 185 | pooled_size, 186 | part_size, 187 | sample_per_part, 188 | trans_std); 189 | } 190 | } -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/src/vision.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dcn_v2.h" 3 | 4 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 5 | m.def("dcn_v2_forward", &dcn_v2_forward, "dcn_v2_forward"); 6 | m.def("dcn_v2_backward", &dcn_v2_backward, "dcn_v2_backward"); 7 | m.def("dcn_v2_psroi_pooling_forward", &dcn_v2_psroi_pooling_forward, "dcn_v2_psroi_pooling_forward"); 8 | m.def("dcn_v2_psroi_pooling_backward", &dcn_v2_psroi_pooling_backward, "dcn_v2_psroi_pooling_backward"); 9 | } 10 | -------------------------------------------------------------------------------- /mmdet/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lbin/Retinaface_Detectron2/579e500b35efac6afc389dfc9bbea0b129e91ba6/mmdet/layers/__init__.py -------------------------------------------------------------------------------- /mmdet/layers/nms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchvision.ops import boxes as box_ops 3 | from torchvision.ops import nms # BC-compat 4 | 5 | from apex.amp import float_function 6 | 7 | 8 | @float_function 9 | def batched_nms(boxes, scores, idxs, iou_threshold): 10 | """ 11 | Same as torchvision.ops.boxes.batched_nms, but safer. 12 | """ 13 | assert boxes.shape[-1] == 4 14 | # TODO may need better strategy. 15 | # Investigate after having a fully-cuda NMS op. 
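# For moderate box counts, the code below defers to torchvision's batched_nms, which
# offsets boxes by category index so a single NMS call covers all classes; for very
# large inputs it instead runs NMS independently per category id, merges the kept
# indices, and returns them sorted by descending score.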
16 | if len(boxes) < 40000: 17 | return box_ops.batched_nms(boxes, scores, idxs, iou_threshold) 18 | 19 | result_mask = scores.new_zeros(scores.size(), dtype=torch.bool) 20 | for id in torch.unique(idxs).cpu().tolist(): 21 | mask = (idxs == id).nonzero().view(-1) 22 | keep = nms(boxes[mask], scores[mask], iou_threshold) 23 | result_mask[mask[keep]] = True 24 | keep = result_mask.nonzero().view(-1) 25 | keep = keep[scores[keep].argsort(descending=True)] 26 | return keep 27 | -------------------------------------------------------------------------------- /mmdet/layers/ssh.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from detectron2.layers import Conv2d, get_norm 5 | 6 | from mmdet.layers.DCNv2.dcn_v2 import DCN 7 | 8 | # from detectron2.layers.batch_norm import NaiveSyncBatchNorm 9 | 10 | 11 | def conv_bn(in_channel, out_channel, stride=1, leaky=0, norm="BN"): 12 | return nn.Sequential( 13 | Conv2d( 14 | in_channel, 15 | out_channel, 16 | kernel_size=3, 17 | stride=stride, 18 | padding=1, 19 | bias=False, 20 | norm=get_norm(norm, out_channel), 21 | ), 22 | nn.LeakyReLU(negative_slope=leaky, inplace=True), 23 | ) 24 | 25 | 26 | def conv_bn_no_relu(in_channel, out_channel, stride, norm="BN"): 27 | return Conv2d( 28 | in_channel, 29 | out_channel, 30 | kernel_size=3, 31 | stride=stride, 32 | padding=1, 33 | bias=False, 34 | norm=get_norm(norm, out_channel), 35 | ) 36 | 37 | 38 | def conv_bn1X1(in_channel, out_channel, stride, leaky=0, norm="BN"): 39 | return nn.Sequential( 40 | Conv2d( 41 | in_channel, 42 | out_channel, 43 | kernel_size=1, 44 | stride=stride, 45 | padding=0, 46 | bias=False, 47 | norm=get_norm(norm, out_channel), 48 | ), 49 | nn.LeakyReLU(negative_slope=leaky, inplace=True), 50 | ) 51 | 52 | 53 | def conv_dw(in_channel, out_channel, stride, leaky=0.1, norm="BN"): 54 | return nn.Sequential( 55 | Conv2d( 56 | in_channel, 57 | out_channel, 58 | kernel_size=3, 59 | stride=stride, 60 | padding=1, 61 | bias=False, 62 | groups=in_channel, 63 | norm=get_norm(norm, out_channel), 64 | ), 65 | nn.LeakyReLU(negative_slope=leaky, inplace=True), 66 | Conv2d( 67 | in_channel, 68 | out_channel, 69 | kernel_size=1, 70 | stride=1, 71 | padding=0, 72 | bias=False, 73 | norm=get_norm(norm, out_channel), 74 | ), 75 | nn.LeakyReLU(negative_slope=leaky, inplace=True), 76 | ) 77 | 78 | 79 | class DeformConv(nn.Module): 80 | def __init__(self, in_channel, out_channel, norm="BN"): 81 | super(DeformConv, self).__init__() 82 | self.actf = nn.Sequential(get_norm(norm, out_channel), nn.ReLU(inplace=True)) 83 | self.conv = DCN( 84 | in_channel, 85 | out_channel, 86 | kernel_size=(3, 3), 87 | stride=1, 88 | padding=1, 89 | dilation=1, 90 | deformable_groups=1, 91 | ) 92 | 93 | def forward(self, x): 94 | x = self.conv(x) 95 | x = self.actf(x) 96 | return x 97 | 98 | 99 | # SSH: Single Stage Headless Face Detector 100 | class SSH(nn.Module): 101 | def __init__(self, cfg, in_channel, out_channel): 102 | super(SSH, self).__init__() 103 | assert out_channel % 4 == 0 104 | self.use_dcnv2 = cfg.MODEL.RETINANET.WITH_DCNv2 105 | self.norm = cfg.MODEL.RETINANET.NORM 106 | leaky = 0 107 | if out_channel <= 64: 108 | leaky = 0.1 109 | self.conv_1 = conv_bn_no_relu(in_channel, out_channel // 2, stride=1, norm=self.norm) 110 | 111 | self.conv_2 = conv_bn(in_channel, out_channel // 4, stride=1, leaky=leaky, norm=self.norm) 112 | self.conv_3 = conv_bn_no_relu(out_channel // 4, out_channel // 4,
stride=1, norm=self.norm) 113 | 114 | self.conv_4 = conv_bn( 115 | out_channel // 4, out_channel // 4, stride=1, leaky=leaky, norm=self.norm 116 | ) 117 | self.conv_5 = conv_bn_no_relu(out_channel // 4, out_channel // 4, stride=1, norm=self.norm) 118 | if self.use_dcnv2: 119 | self.dcn = DeformConv(in_channel, out_channel, norm=self.norm) 120 | 121 | def forward(self, input): 122 | conv_1 = self.conv_1(input) 123 | 124 | conv_2 = self.conv_2(input) 125 | conv_2_3 = self.conv_3(conv_2) 126 | 127 | conv_4 = self.conv_4(conv_2) 128 | conv_4_5 = self.conv_5(conv_4) 129 | 130 | out = torch.cat([conv_1, conv_2_3, conv_4_5], dim=1) 131 | out = F.relu(out) 132 | if self.use_dcnv2: 133 | out = self.dcn(out) 134 | return out 135 | -------------------------------------------------------------------------------- /mmdet/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone import * 2 | from .meta_arch import RetinaFace 3 | -------------------------------------------------------------------------------- /mmdet/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .torch_resnet import build_torch_resnet_backbone, build_torch_resnet_fpn_backbone 2 | 3 | __all__ = [k for k in globals().keys() if not k.startswith("_")] 4 | -------------------------------------------------------------------------------- /mmdet/modeling/backbone/torch_resnet.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torchvision.models.resnet as resnet 3 | from detectron2.layers import ShapeSpec 4 | 5 | # from centernet.network.backbone import Backbone 6 | from detectron2.modeling import Backbone 7 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 8 | from detectron2.modeling.backbone.fpn import FPN, LastLevelMaxPool 9 | 10 | _resnet_mapper = {18: resnet.resnet18, 50: resnet.resnet50, 101: resnet.resnet101} 11 | 12 | 13 | class ResnetBackbone(Backbone): 14 | def __init__(self, cfg, out_features=None, pretrained=True): 15 | super().__init__() 16 | depth = cfg.MODEL.RESNETS.DEPTH 17 | backbone = _resnet_mapper[depth](pretrained=pretrained) 18 | self.stage0 = nn.Sequential(backbone.conv1, backbone.bn1, backbone.relu, backbone.maxpool) 19 | self.stage1 = backbone.layer1 20 | self.stage2 = backbone.layer2 21 | self.stage3 = backbone.layer3 22 | self.stage4 = backbone.layer4 23 | 24 | self.stages_and_names = [] 25 | 26 | self.add_module("res1", self.stage0) 27 | self.stages_and_names.append((self.stage0, "res1")) 28 | 29 | self.add_module("res2", self.stage1) 30 | self.stages_and_names.append((self.stage1, "res2")) 31 | 32 | self.add_module("res3", self.stage2) 33 | self.stages_and_names.append((self.stage2, "res3")) 34 | 35 | self.add_module("res4", self.stage3) 36 | self.stages_and_names.append((self.stage3, "res4")) 37 | 38 | self.add_module("res5", self.stage4) 39 | self.stages_and_names.append((self.stage4, "res5")) 40 | 41 | self._out_feature_strides = {} 42 | self._out_feature_channels = {} 43 | 44 | self._out_feature_strides["res3"] = 8 45 | self._out_feature_channels["res3"] = 512 46 | 47 | self._out_feature_strides["res4"] = 16 48 | self._out_feature_channels["res4"] = 1024 49 | 50 | self._out_feature_strides["res5"] = 32 51 | self._out_feature_channels["res5"] = 2048 52 | 53 | self._out_features = out_features 54 | 55 | def forward(self, x): 56 | outputs = {} 57 | for stage, name in self.stages_and_names: 58 | 
x = stage(x) 59 | if name in self._out_features: 60 | outputs[name] = x 61 | 62 | return outputs 63 | 64 | 65 | @BACKBONE_REGISTRY.register() 66 | def build_torch_resnet_backbone(cfg): 67 | out_features = cfg.MODEL.RESNETS.OUT_FEATURES 68 | return ResnetBackbone(cfg, out_features) 69 | 70 | 71 | @BACKBONE_REGISTRY.register() 72 | def build_torch_resnet_fpn_backbone(cfg, input_shape: ShapeSpec): 73 | """ 74 | Args: 75 | cfg: a detectron2 CfgNode 76 | Returns: 77 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 78 | """ 79 | bottom_up = build_torch_resnet_backbone(cfg) 80 | in_features = cfg.MODEL.FPN.IN_FEATURES 81 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 82 | backbone = FPN( 83 | bottom_up=bottom_up, 84 | in_features=in_features, 85 | out_channels=out_channels, 86 | norm=cfg.MODEL.FPN.NORM, 87 | top_block=LastLevelMaxPool(), 88 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE, 89 | ) 90 | return backbone 91 | -------------------------------------------------------------------------------- /mmdet/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | from .retinaface import RetinaFace 2 | -------------------------------------------------------------------------------- /mmdet/modeling/meta_arch/retinaface.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import List 3 | 4 | import torch 5 | from detectron2.layers import ShapeSpec, cat 6 | from detectron2.modeling.anchor_generator import build_anchor_generator 7 | from detectron2.modeling.meta_arch import RetinaNet 8 | from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY 9 | from detectron2.structures import ImageList 10 | from detectron2.utils.events import get_event_storage 11 | from fvcore.nn import sigmoid_focal_loss_jit, smooth_l1_loss 12 | from torch import nn 13 | from torch.nn import functional as F 14 | 15 | # from mmdet.layers.nms import batched_nms 16 | from mmdet.layers.ssh import SSH 17 | 18 | __all__ = ["RetinaFace"] 19 | 20 | 21 | @META_ARCH_REGISTRY.register() 22 | class RetinaFace(RetinaNet): 23 | """ 24 | Implement RetinaFace (https://arxiv.org/abs/1905.00641). 25 | """ 26 | 27 | def __init__(self, cfg): 28 | super().__init__(cfg) 29 | 30 | backbone_shape = self.backbone.output_shape() 31 | feature_shapes = [backbone_shape[f] for f in self.in_features] 32 | self.head = RetinaFaceHead(cfg, feature_shapes) 33 | self.input_format = cfg.INPUT.FORMAT 34 | self.to(self.device) 35 | 36 | def preprocess_image(self, batched_inputs): 37 | """ 38 | Normalize, pad and batch the input images. 39 | """ 40 | images = [x["image"].to(self.device) for x in batched_inputs] 41 | if self.input_format == "RGB": 42 | images = [(x / 255.0 - self.pixel_mean) / self.pixel_std for x in images] 43 | else: 44 | images = [(x - self.pixel_mean) / self.pixel_std for x in images] 45 | images = ImageList.from_tensors(images, self.backbone.size_divisibility) 46 | return images 47 | 48 | 49 | class RetinaFaceHead(nn.Module): 50 | """ 51 | The head used in RetinaFace for object classification and box regression. 52 | It has two subnets for the two tasks, with a common structure but separate parameters.
53 | """ 54 | 55 | def __init__(self, cfg, input_shape: List[ShapeSpec]): 56 | super().__init__() 57 | # fmt: off 58 | in_channels = input_shape[0].channels 59 | num_classes = cfg.MODEL.RETINANET.NUM_CLASSES 60 | num_convs = cfg.MODEL.RETINANET.NUM_CONVS 61 | prior_prob = cfg.MODEL.RETINANET.PRIOR_PROB 62 | num_anchors = build_anchor_generator(cfg, input_shape).num_cell_anchors 63 | # fmt: on 64 | assert ( 65 | len(set(num_anchors)) == 1 66 | ), "Using different number of anchors between levels is not currently supported!" 67 | num_anchors = num_anchors[0] 68 | 69 | self.ssh = [] 70 | for i in range(len(cfg.MODEL.RETINANET.IN_FEATURES)): 71 | ssh = SSH(cfg, in_channels, in_channels) 72 | name = "ssh" + str(i) 73 | self.add_module(name, ssh) 74 | self.ssh.append(ssh) 75 | 76 | cls_subnet = [] 77 | bbox_subnet = [] 78 | for _ in range(num_convs): 79 | cls_subnet.append( 80 | nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) 81 | ) 82 | cls_subnet.append(nn.ReLU()) 83 | bbox_subnet.append( 84 | nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) 85 | ) 86 | bbox_subnet.append(nn.ReLU()) 87 | 88 | self.cls_score = nn.Conv2d( 89 | in_channels, num_anchors * num_classes, kernel_size=1, stride=1, padding=0 90 | ) 91 | self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=1, stride=1, padding=0) 92 | 93 | # Initialization 94 | for modules in [self.cls_score, self.bbox_pred]: 95 | for layer in modules.modules(): 96 | if isinstance(layer, nn.Conv2d): 97 | torch.nn.init.normal_(layer.weight, mean=0, std=0.01) 98 | torch.nn.init.constant_(layer.bias, 0) 99 | 100 | # Use prior in model initialization to improve stability 101 | bias_value = -math.log((1 - prior_prob) / prior_prob) 102 | torch.nn.init.constant_(self.cls_score.bias, bias_value) 103 | 104 | def forward(self, features): 105 | """ 106 | Arguments: 107 | features (list[Tensor]): FPN feature map tensors in high to low resolution. 108 | Each tensor in the list correspond to different feature levels. 109 | Returns: 110 | logits (list[Tensor]): #lvl tensors, each has shape (N, AxK, Hi, Wi). 111 | The tensor predicts the classification probability 112 | at each spatial position for each of the A anchors and K object 113 | classes. 114 | bbox_reg (list[Tensor]): #lvl tensors, each has shape (N, Ax4, Hi, Wi). 115 | The tensor predicts 4-vector (dx,dy,dw,dh) box 116 | regression values for every anchor. These values are the 117 | relative offset between the anchor and the ground truth box. 
118 | """ 119 | logits = [] 120 | bbox_reg = [] 121 | for i, feature in enumerate(features): 122 | feature = self.ssh[i](feature) 123 | logits.append(self.cls_score(feature)) 124 | bbox_reg.append(self.bbox_pred(feature)) 125 | return logits, bbox_reg 126 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import glob 4 | import os 5 | from distutils.core import Extension, setup 6 | 7 | import numpy 8 | import torch 9 | from Cython.Build import cythonize 10 | from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension 11 | 12 | requirements = ["torch", "torchvision"] 13 | 14 | torch_ver = [int(x) for x in torch.__version__.split(".")[:2]] 15 | assert torch_ver >= [1, 4], "Requires PyTorch >= 1.4" 16 | 17 | bbox_oevelaps_extensions = [ 18 | Extension( 19 | "mmdet.evaluation.bbox", 20 | ["mmdet/evaluation/box_overlaps.pyx"], 21 | include_dirs=[numpy.get_include()], 22 | ) 23 | ] 24 | 25 | 26 | def get_dcnv2_extensions(): 27 | this_dir = os.path.dirname(os.path.abspath(__file__)) 28 | extensions_dir = os.path.join(this_dir, "mmdet/layers/DCNv2/src") 29 | 30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 33 | 34 | os.environ["CC"] = "g++" 35 | sources = main_file + source_cpu 36 | extension = CppExtension 37 | extra_compile_args = {"cxx": []} 38 | define_macros = [] 39 | 40 | if torch.cuda.is_available() and CUDA_HOME is not None: 41 | extension = CUDAExtension 42 | sources += source_cuda 43 | define_macros += [("WITH_CUDA", None)] 44 | extra_compile_args["nvcc"] = [ 45 | "-DCUDA_HAS_FP16=1", 46 | "-D__CUDA_NO_HALF_OPERATORS__", 47 | "-D__CUDA_NO_HALF_CONVERSIONS__", 48 | "-D__CUDA_NO_HALF2_OPERATORS__", 49 | ] 50 | else: 51 | # raise NotImplementedError('Cuda is not available') 52 | pass 53 | 54 | sources = [os.path.join(extensions_dir, s) for s in sources] 55 | include_dirs = [extensions_dir] 56 | ext_modules = [ 57 | extension( 58 | "_ext", 59 | sources, 60 | include_dirs=include_dirs, 61 | define_macros=define_macros, 62 | extra_compile_args=extra_compile_args, 63 | ) 64 | ] 65 | return ext_modules 66 | 67 | 68 | setup( 69 | name="mmdet", 70 | version="0.1.0", 71 | author="lbin", 72 | url="https://github.com/lbin/Retinaface_Mobilenet_Pytorch", 73 | description="mmdet", 74 | # packages=find_packages(exclude=("configs", "tests")), 75 | python_requires=">=3.6", 76 | install_requires=[ 77 | "termcolor>=1.1", 78 | "Pillow", # you can also use pillow-simd for better performance 79 | "yacs>=0.1.6", 80 | "tabulate", 81 | "cloudpickle", 82 | "matplotlib", 83 | "tqdm>4.29.0", 84 | "tensorboard", 85 | "fvcore", 86 | "future", # used by caffe2 87 | "pydot", # used to save caffe2 SVGs 88 | ], 89 | extras_require={ 90 | "all": ["shapely", "psutil"], 91 | "dev": ["flake8", "isort", "black==19.3b0", "flake8-bugbear", "flake8-comprehensions"], 92 | }, 93 | ext_modules=[cythonize(bbox_oevelaps_extensions)[0], get_dcnv2_extensions()[0]], 94 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 95 | ) 96 | -------------------------------------------------------------------------------- /train_net.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from detectron2.checkpoint import DetectionCheckpointer 4 | 5 
--------------------------------------------------------------------------------
/train_net.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | from detectron2.checkpoint import DetectionCheckpointer
4 | 
5 | # from detectron2.config import get_cfg
6 | from detectron2.data import build_detection_test_loader, build_detection_train_loader
7 | from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch
8 | 
9 | from mmdet.config import get_cfg
10 | from mmdet.data.datasets.widerface import register_widerface
11 | from mmdet.data.widerface_dataset_mapper import WiderFace_DatasetMapper
12 | 
13 | # from mmdet.engine.apex_trainer import ApexTrainer
14 | from mmdet.evaluation.evaluator import WiderFaceEvaluator
15 | 
16 | 
17 | class Trainer(DefaultTrainer):
18 |     @classmethod
19 |     def build_evaluator(cls, cfg, dataset_name, output_folder=None):
20 |         if output_folder is None:
21 |             output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
22 | 
23 |         return WiderFaceEvaluator(dataset_name, output_folder)
24 | 
25 |     @classmethod
26 |     def build_train_loader(cls, cfg):
27 |         return build_detection_train_loader(cfg, mapper=WiderFace_DatasetMapper(cfg, True))
28 | 
29 |     @classmethod
30 |     def build_test_loader(cls, cfg, dataset_name):
31 |         return build_detection_test_loader(
32 |             cfg, dataset_name, mapper=WiderFace_DatasetMapper(cfg, False)
33 |         )
34 | 
35 | 
36 | def setup(args):
37 |     """
38 |     Create configs and perform basic setups.
39 |     """
40 |     cfg = get_cfg()
41 |     cfg.merge_from_file(args.config_file)
42 |     cfg.merge_from_list(args.opts)
43 |     cfg.freeze()
44 |     default_setup(cfg, args)
45 |     return cfg
46 | 
47 | 
48 | def main(args):
49 |     cfg = setup(args)
50 |     register_widerface()
51 | 
52 |     if args.eval_only:
53 |         model = Trainer.build_model(cfg)
54 |         DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
55 |             cfg.MODEL.WEIGHTS, resume=args.resume
56 |         )
57 |         res = Trainer.test(cfg, model)
58 |         return res
59 | 
60 |     trainer = Trainer(cfg)
61 |     trainer.resume_or_load(resume=args.resume)
62 |     return trainer.train()
63 | 
64 | 
65 | if __name__ == "__main__":
66 |     args = default_argument_parser().parse_args()
67 |     print("Command Line Args:", args)
68 |     launch(
69 |         main,
70 |         args.num_gpus,
71 |         num_machines=args.num_machines,
72 |         machine_rank=args.machine_rank,
73 |         dist_url=args.dist_url,
74 |         args=(args,),
75 |     )
76 | 
--------------------------------------------------------------------------------
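For reference, training would typically be launched with something like `python train_net.py --config-file configs/facetron/retinaface_r_50_3x.yaml --num-gpus 4`, where the flags come from detectron2's `default_argument_parser`. The `--eval-only` branch of `main()` can also be reproduced programmatically; a minimal sketch, assuming a finished checkpoint at a hypothetical `output/model_final.pth` and skipping `default_setup` logging:

```python
# Hypothetical evaluation sketch -- mirrors the --eval-only path in main() above.
from detectron2.checkpoint import DetectionCheckpointer

from mmdet.config import get_cfg
from mmdet.data.datasets.widerface import register_widerface
from train_net import Trainer  # the Trainer subclass defined in this file

cfg = get_cfg()
cfg.merge_from_file("configs/facetron/retinaface_r_50_3x.yaml")
cfg.MODEL.WEIGHTS = "output/model_final.pth"  # hypothetical checkpoint path
cfg.freeze()

register_widerface()  # makes the WIDER FACE splits visible to the data loaders
model = Trainer.build_model(cfg)
DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(cfg.MODEL.WEIGHTS, resume=False)
print(Trainer.test(cfg, model))  # runs WiderFaceEvaluator on the registered eval split
```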