├── .flake8 ├── .gitignore ├── README.md ├── configs ├── Base_RetinaFace.yaml └── facetron │ ├── retinaface_r_50_3x.yaml │ └── retinaface_r_50_torchvision_3x.yaml ├── dev └── linter.sh ├── mmdet ├── __init__.py ├── config │ ├── __init__.py │ ├── config.py │ └── defaults.py ├── data │ ├── __init__.py │ ├── datasets │ │ ├── __init__.py │ │ └── widerface.py │ ├── transforms │ │ ├── __init__.py │ │ └── widerface_transform.py │ └── widerface_dataset_mapper.py ├── evaluation │ ├── __init__.py │ ├── box_overlaps.c │ ├── box_overlaps.pyx │ ├── evaluator.py │ └── widerface_evaluation.py ├── layers │ ├── DCNv2 │ │ ├── __init__.py │ │ ├── dcn_v2.py │ │ └── src │ │ │ ├── cpu │ │ │ ├── dcn_v2_cpu.cpp │ │ │ ├── dcn_v2_im2col_cpu.cpp │ │ │ ├── dcn_v2_im2col_cpu.h │ │ │ ├── dcn_v2_psroi_pooling_cpu.cpp │ │ │ └── vision.h │ │ │ ├── cuda │ │ │ ├── dcn_v2_cuda.cu │ │ │ ├── dcn_v2_im2col_cuda.cu │ │ │ ├── dcn_v2_im2col_cuda.h │ │ │ ├── dcn_v2_psroi_pooling_cuda.cu │ │ │ └── vision.h │ │ │ ├── dcn_v2.h │ │ │ └── vision.cpp │ ├── __init__.py │ ├── nms.py │ └── ssh.py └── modeling │ ├── __init__.py │ ├── backbone │ ├── __init__.py │ └── torch_resnet.py │ └── meta_arch │ ├── __init__.py │ └── retinaface.py ├── setup.py └── train_net.py /.flake8: -------------------------------------------------------------------------------- 1 | # This is an example .flake8 config, used when developing *Black* itself. 2 | # Keep in sync with setup.cfg which is used for source packages. 3 | 4 | [flake8] 5 | ignore = W503, E203, E221, C901 6 | max-line-length = 180 7 | max-complexity = 18 8 | select = B,C,E,F,W,T4,B9 9 | exclude = build,__init__.py -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # User Define 2 | .vscode 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | 109 | .DS_Store 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RetinaFace in PyTorch 2 | 3 | A [PyTorch](https://pytorch.org/) implementation of [RetinaFace: Single-stage Dense Face Localisation in the Wild](https://arxiv.org/abs/1905.00641). The official MXNet implementation can be found [here](https://github.com/deepinsight/insightface/tree/master/RetinaFace). 4 | 5 | The old version can be found at [v1.0](https://github.com/lbin/Retinaface_Mobilenet_Pytorch/tree/v1.0). 6 | 7 | 8 | ## WiderFace Val Performance (single scale, ResNet-50 backbone) 9 | 10 | | Style | Easy | Medium | Hard | 11 | | :-------------------- | :----: | :----: | :----: | 12 | | Ours (Original Scale) | 94.14% | 92.71% | 81.13% | 13 | 14 | ## Dependencies 15 | 16 | * pytorch >= 1.4.0 17 | * torchvision >= 0.4.0 18 | * python >= 3.6 19 | 20 | ## Installation 21 | 22 | pip install -e .
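For a quick end-to-end sanity check outside of `train_net.py`, the pieces above can be wired together roughly as follows. This is a minimal sketch, not the project's documented API: it assumes detectron2 is installed, the DCNv2 extension has been built by `pip install -e .`, and that importing `mmdet` registers the `RetinaFace` meta-architecture with detectron2.

```python
# Hypothetical usage sketch; the supported training entry point is train_net.py.
from detectron2.modeling import build_model

import mmdet  # noqa: F401  -- importing should register the RetinaFace meta-architecture
from mmdet.config import get_cfg
from mmdet.data.datasets.widerface import register_widerface

register_widerface()  # registers "widerface_train"/"widerface_val" (data expected under ./datasets/widerface)
cfg = get_cfg()       # detectron2 defaults plus the extra RETINANET.WITH_DCNv2 / NORM keys
cfg.merge_from_file("configs/facetron/retinaface_r_50_3x.yaml", allow_unsafe=True)
model = build_model(cfg)  # a torch.nn.Module; load weights separately, e.g. with DetectionCheckpointer
```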
23 | -------------------------------------------------------------------------------- /configs/Base_RetinaFace.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "RetinaFace" 3 | BACKBONE: 4 | NAME: "build_resnet_fpn_backbone" 5 | RESNETS: 6 | OUT_FEATURES: ["res3", "res4", "res5"] 7 | ANCHOR_GENERATOR: 8 | SIZES: !!python/object/apply:eval ["[[x, x * 2**(1.0/3), x * 2**(2.0/3) ] for x in [32, 64, 128, 256, 512 ]]"] 9 | FPN: 10 | IN_FEATURES: ["res3", "res4", "res5"] 11 | RETINANET: 12 | IOU_THRESHOLDS: [0.4, 0.5] 13 | IOU_LABELS: [0, -1, 1] 14 | DATASETS: 15 | TRAIN: ("widerface_train",) 16 | TEST: ("widerface_val",) 17 | SOLVER: 18 | IMS_PER_BATCH: 16 19 | BASE_LR: 0.01 # Note that RetinaNet uses a different default learning rate 20 | STEPS: (60000, 80000) 21 | MAX_ITER: 90000 22 | CHECKPOINT_PERIOD: 2000 23 | INPUT: 24 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 25 | DATALOADER: 26 | NUM_WORKERS: 16 27 | TEST: 28 | DETECTIONS_PER_IMAGE: 300 29 | EVAL_PERIOD: 10000 30 | VERSION: 2 31 | OUTPUT_DIR: "/mnt/tensorboard/" 32 | -------------------------------------------------------------------------------- /configs/facetron/retinaface_r_50_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base_RetinaFace.yaml" 2 | DATASETS: 3 | TRAIN: ("widerface_train",) 4 | TEST: ("widerface_val",) 5 | MODEL: 6 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 7 | BACKBONE: 8 | NAME: "build_resnet_fpn_backbone" 9 | FREEZE_AT: 0 10 | RESNETS: 11 | DEPTH: 50 12 | STRIDE_IN_1X1: False 13 | NORM: 'SyncBN' 14 | FPN: 15 | NORM: 'SyncBN' 16 | RETINANET: 17 | NUM_CLASSES: 1 18 | IN_FEATURES: ['p3', 'p4', 'p5'] 19 | SCORE_THRESH_TEST: 0.02 20 | TOPK_CANDIDATES_TEST: 5000 21 | NMS_THRESH_TEST: 0.4 22 | WITH_DCNv2: True 23 | NORM: 'SyncBN' 24 | SMOOTH_L1_LOSS_BETA: 0.0 25 | ANCHOR_GENERATOR: 26 | SIZES: [[16, 32], [64, 128], [256, 512]] 27 | ASPECT_RATIOS: [[1.0]] 28 | SOLVER: 29 | IMS_PER_BATCH: 32 30 | BASE_LR: 0.02 # Note that RetinaNet uses a different default learning rate 31 | STEPS: (210000, 250000) 32 | MAX_ITER: 270000 33 | CHECKPOINT_PERIOD: 10000 34 | INPUT: 35 | MIN_SIZE_TRAIN: (540, 640, 672, 704, 736, 768, 800) 36 | MAX_SIZE_TRAIN: 1920 37 | MIN_SIZE_TEST: 0 38 | MAX_SIZE_TEST: 0 39 | CROP: 40 | ENABLED: True 41 | -------------------------------------------------------------------------------- /configs/facetron/retinaface_r_50_torchvision_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base_RetinaFace.yaml" 2 | DATASETS: 3 | TRAIN: ("widerface_train",) 4 | TEST: ("widerface_val",) 5 | MODEL: 6 | WEIGHTS: "" 7 | PIXEL_MEAN: [0.485, 0.456, 0.406] 8 | PIXEL_STD: [0.229, 0.224, 0.225] 9 | BACKBONE: 10 | NAME: "build_torch_resnet_fpn_backbone" 11 | FREEZE_AT: 0 12 | RESNETS: 13 | DEPTH: 50 14 | STRIDE_IN_1X1: False 15 | NORM: 'SyncBN' 16 | FPN: 17 | NORM: 'SyncBN' 18 | RETINANET: 19 | NUM_CLASSES: 1 20 | IN_FEATURES: ['p3', 'p4', 'p5'] 21 | SCORE_THRESH_TEST: 0.02 22 | TOPK_CANDIDATES_TEST: 5000 23 | NMS_THRESH_TEST: 0.4 24 | WITH_DCNv2: True 25 | NORM: 'SyncBN' 26 | SMOOTH_L1_LOSS_BETA: 0.0 27 | ANCHOR_GENERATOR: 28 | SIZES: [[16, 32], [64, 128], [256, 512]] 29 | ASPECT_RATIOS: [[1.0]] 30 | SOLVER: 31 | IMS_PER_BATCH: 32 32 | BASE_LR: 0.02 # Note that RetinaNet uses a different default learning rate 33 | STEPS: (210000, 250000) 34 | MAX_ITER: 270000 35 | CHECKPOINT_PERIOD: 10000 36 | INPUT: 37 | MIN_SIZE_TRAIN: 
(540, 640, 672, 704, 736, 768, 800) 38 | MAX_SIZE_TRAIN: 1920 39 | MIN_SIZE_TEST: 0 40 | MAX_SIZE_TEST: 0 41 | CROP: 42 | ENABLED: True 43 | FORMAT: "RGB" -------------------------------------------------------------------------------- /dev/linter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | 4 | # Run this script at project root by "./dev/linter.sh" before you commit 5 | 6 | vergte() { 7 | [ "$2" = "$(echo -e "$1\\n$2" | sort -V | head -n1)" ] 8 | } 9 | 10 | { 11 | black --version | grep "19.3b0" > /dev/null 12 | } || { 13 | echo "Linter requires black==19.3b0 !" 14 | exit 1 15 | } 16 | 17 | ISORT_TARGET_VERSION="4.3.21" 18 | ISORT_VERSION=$(isort -v | grep VERSION | awk '{print $2}') 19 | vergte "$ISORT_VERSION" "$ISORT_TARGET_VERSION" || { 20 | echo "Linter requires isort>=${ISORT_TARGET_VERSION} !" 21 | exit 1 22 | } 23 | 24 | set -v 25 | 26 | echo "Running isort ..." 27 | isort -y -sp . --atomic 28 | 29 | echo "Running black ..." 30 | black -l 100 . 31 | 32 | echo "Running flake8 ..." 33 | if [ -x "$(command -v flake8-3)" ]; then 34 | flake8-3 . 35 | else 36 | python3 -m flake8 . 37 | fi 38 | 39 | # echo "Running mypy ..." 40 | # Pytorch does not have enough type annotations 41 | # mypy mmdet/solver mmdet/structures mmdet/config 42 | 43 | # echo "Running clang-format ..." 44 | # find . -regex ".*\.\(cpp\|c\|cc\|cu\|cxx\|h\|hh\|hpp\|hxx\|tcc\|mm\|m\)" -print0 | xargs -0 clang-format -i 45 | 46 | # command -v arc > /dev/null && arc lint 47 | -------------------------------------------------------------------------------- /mmdet/__init__.py: -------------------------------------------------------------------------------- 1 | from .modeling import RetinaFace, build_torch_resnet_fpn_backbone 2 | 3 | __version__ = "0.1.0" 4 | -------------------------------------------------------------------------------- /mmdet/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import get_cfg 2 | 3 | __all__ = ["get_cfg"] 4 | -------------------------------------------------------------------------------- /mmdet/config/config.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import CfgNode 2 | 3 | 4 | def get_cfg() -> CfgNode: 5 | """ 6 | Get a copy of the default config. 7 | Returns: 8 | a detectron2 CfgNode instance. 9 | """ 10 | from .defaults import _C 11 | 12 | return _C.clone() 13 | -------------------------------------------------------------------------------- /mmdet/config/defaults.py: -------------------------------------------------------------------------------- 1 | # from detectron2.config import CfgNode as CN 2 | from detectron2.config.defaults import _C 3 | 4 | # ---------------------------------------------------------------------------- # 5 | # Additional Configs 6 | # ---------------------------------------------------------------------------- # 7 | 8 | _C.MODEL.RETINANET.WITH_DCNv2 = False 9 | _C.MODEL.RETINANET.NORM = "BN" 10 | -------------------------------------------------------------------------------- /mmdet/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import datasets # just to register data 2 | -------------------------------------------------------------------------------- /mmdet/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from . import widerface 2 | -------------------------------------------------------------------------------- /mmdet/data/datasets/widerface.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import cv2 4 | import numpy as np 5 | from detectron2.data import DatasetCatalog, MetadataCatalog 6 | from detectron2.structures.boxes import BoxMode 7 | 8 | 9 | def get_widerface_metadata(): 10 | metadata = {"thing_classes": ["face"]} 11 | return metadata 12 | 13 | 14 | def get_widerface_dicts(image_root): 15 | label_file = os.path.join(image_root, "label.txt") 16 | 17 | imgs_path = [] 18 | imgs_path_no_head = [] 19 | words = [] 20 | 21 | with open(label_file) as f: 22 | lines = f.readlines() 23 | isFirst = True 24 | labels = [] 25 | for line in lines: 26 | line = line.rstrip() 27 | if line.startswith("#"): 28 | if isFirst is True: 29 | isFirst = False 30 | else: 31 | labels_copy = labels.copy() 32 | words.append(labels_copy) 33 | labels.clear() 34 | path = line[2:] 35 | imgs_path_no_head.append(path) 36 | path = label_file.replace("label.txt", "images/") + path 37 | imgs_path.append(path) 38 | else: 39 | line = line.split(" ") 40 | label = [float(x) for x in line] 41 | labels.append(label) 42 | 43 | words.append(labels) 44 | 45 | widerface_dicts = [] 46 | for index in range(len(words)): 47 | 48 | filename = imgs_path[index] 49 | height, width = cv2.imread(filename).shape[:2] 50 | 51 | record = {} 52 | record["file_name"] = filename 53 | record["image_id"] = imgs_path_no_head[index] 54 | record["height"] = height 55 | record["width"] = width 56 | 57 | labels = words[index] 58 | # annotations = np.zeros((0, 15)) 59 | objs = [] 60 | 61 | for idx, label in enumerate(labels): 62 | annotation = np.zeros((1, 15)) 63 | # bbox 64 | annotation[0, 0] = label[0] # x1 65 | annotation[0, 1] = label[1] # y1 66 | 67 | if label[0] >= width or label[1] >= height: 68 | continue 69 | 70 | if label[2] <= 0 or label[3] <= 0: 71 | continue 72 | 73 | annotation[0, 2] = label[0] + label[2] # x2 74 | if annotation[0, 2] >= width: 75 | annotation[0, 2] = width - 1 76 | 77 | annotation[0, 3] = label[1] + label[3] # y2 78 | if annotation[0, 3] >= height: 79 | annotation[0, 3] = height - 1 80 | 81 | if len(label) > 4: 82 | # landmarks 83 | annotation[0, 4] = label[4] # l0_x 84 | annotation[0, 5] = label[5] # l0_y 85 | annotation[0, 6] = label[7] # l1_x 86 | annotation[0, 7] = label[8] # l1_y 87 | annotation[0, 8] = label[10] # l2_x 88 | annotation[0, 9] = label[11] # l2_y 89 | annotation[0, 10] = label[13] # l3_x 90 | annotation[0, 11] = label[14] # l3_y 91 | annotation[0, 12] = label[16] # l4_x 92 | annotation[0, 13] = label[17] # l4_y 93 | if annotation[0, 4] < 0: 94 | annotation[0, 14] = -1 95 | else: 96 | annotation[0, 14] = 1 97 | obj = { 98 | "bbox": [annotation[0, 0], annotation[0, 1], annotation[0, 2], annotation[0, 3]], 99 | "bbox_mode": BoxMode.XYXY_ABS, 100 | "landmark": annotation, 101 | "category_id": 0, 102 | } 103 | objs.append(obj) 104 | 105 | record["annotations"] = objs 106 | widerface_dicts.append(record) 107 | return widerface_dicts 108 | 109 | 110 | def register_widerface(): 111 | SPLITS = { 112 | "widerface_train": ("widerface/train", "widerface/train/label.txt"), 113 | "widerface_val": ("widerface/val", 
"widerface/val/label.txt"), 114 | } 115 | for name, (image_root, label_file) in SPLITS.items(): 116 | label_file = os.path.join("datasets", label_file) 117 | image_root = os.path.join("datasets", image_root) 118 | register_widerface_instance(name, image_root) 119 | 120 | 121 | def register_widerface_instance(name, image_root): 122 | DatasetCatalog.register(name, lambda name=name: get_widerface_dicts(image_root)) 123 | MetadataCatalog.get(name).set(**get_widerface_metadata()) 124 | -------------------------------------------------------------------------------- /mmdet/data/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | from detectron2.data.transforms import * 2 | from fvcore.transforms import * 3 | 4 | from .widerface_transform import * 5 | 6 | __all__ = [k for k in globals().keys() if not k.startswith("_")] 7 | -------------------------------------------------------------------------------- /mmdet/data/transforms/widerface_transform.py: -------------------------------------------------------------------------------- 1 | import random 2 | import sys 3 | 4 | import numpy as np 5 | from detectron2.data.transforms import ResizeTransform, TransformGen 6 | from fvcore.transforms.transform import CropTransform, NoOpTransform 7 | from PIL import Image 8 | 9 | __all__ = ["WiderFace_ResizeShortestEdge", "WiderFace_NoOpTransform", "WiderFace_RandomCrop"] 10 | 11 | 12 | class WiderFace_ResizeShortestEdge(TransformGen): 13 | """ 14 | Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge. 15 | If `max_size` is reached, then downscale so that the longer edge does not exceed max_size. 16 | """ 17 | 18 | def __init__( 19 | self, short_edge_length, max_size=sys.maxsize, sample_style="choice", interp=Image.BILINEAR 20 | ): 21 | """ 22 | Args: 23 | short_edge_length (list[int]): If ``sample_style=="range"``, 24 | a [min, max] interval from which to sample the shortest edge length. 25 | If ``sample_style=="choice"``, a list of shortest edge lengths to sample from. 26 | max_size (int): maximum allowed longest edge length. 27 | sample_style (str): either "range" or "choice". 28 | """ 29 | super().__init__() 30 | assert sample_style in ["range", "choice"], sample_style 31 | 32 | self.is_range = sample_style == "range" 33 | if isinstance(short_edge_length, int): 34 | short_edge_length = (short_edge_length, short_edge_length) 35 | self._init(locals()) 36 | 37 | def get_transform(self, img): 38 | h, w = img.shape[:2] 39 | if min(h, w) >= self.short_edge_length[0]: 40 | return NoOpTransform() 41 | 42 | scale = self.short_edge_length[0] * 1.0 / min(h, w) 43 | newh = h * scale 44 | neww = w * scale 45 | neww = int(neww + 0.5) 46 | newh = int(newh + 0.5) 47 | return ResizeTransform(h, w, newh, neww, self.interp) 48 | 49 | 50 | class WiderFace_NoOpTransform(TransformGen): 51 | """ 52 | Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge. 53 | If `max_size` is reached, then downscale so that the longer edge does not exceed max_size. 54 | """ 55 | 56 | def __init__(self): 57 | """ 58 | Args: 59 | short_edge_length (list[int]): If ``sample_style=="range"``, 60 | a [min, max] interval from which to sample the shortest edge length. 61 | If ``sample_style=="choice"``, a list of shortest edge lengths to sample from. 62 | max_size (int): maximum allowed longest edge length. 63 | sample_style (str): either "range" or "choice". 
64 | """ 65 | super().__init__() 66 | 67 | def get_transform(self, img): 68 | 69 | return NoOpTransform() 70 | 71 | 72 | class WiderFace_RandomCrop(TransformGen): 73 | """ 74 | Randomly crop a subimage out of an image. 75 | """ 76 | 77 | def __init__(self): 78 | super().__init__() 79 | self._init(locals()) 80 | 81 | def get_transform(self, img): 82 | h, w = img.shape[:2] 83 | croph, cropw = self.get_crop_size((h, w)) 84 | assert h >= croph and w >= cropw, "Shape computation in {} has bugs.".format(self) 85 | h0 = np.random.randint(h - croph + 1) 86 | w0 = np.random.randint(w - cropw + 1) 87 | return CropTransform(w0, h0, cropw, croph) 88 | 89 | def get_crop_size(self, image_size): 90 | """ 91 | Args: 92 | image_size (tuple): height, width 93 | Returns: 94 | crop_size (tuple): height, width in absolute pixels 95 | """ 96 | h, w = image_size 97 | 98 | # crop_size = np.asarray([0.1, 0.9], dtype=np.float32) 99 | # ch, cw = crop_size + np.random.rand(2) * (1 - crop_size) 100 | # return int(h * ch + 0.5), int(w * cw + 0.5) 101 | 102 | PRE_SCALES = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] 103 | scale = random.choice(PRE_SCALES) 104 | short_side = min(h, w) 105 | w = int(scale * short_side) 106 | h = w 107 | return h, w 108 | -------------------------------------------------------------------------------- /mmdet/data/widerface_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | import copy 3 | import logging 4 | 5 | import numpy as np 6 | import torch 7 | from detectron2.data import detection_utils as utils 8 | from fvcore.common.file_io import PathManager 9 | from PIL import Image 10 | 11 | from . import transforms as T 12 | 13 | """ 14 | This file contains the default mapping that's applied to "dataset dicts". 15 | """ 16 | 17 | __all__ = ["WiderFace_DatasetMapper"] 18 | 19 | 20 | class WiderFace_DatasetMapper: 21 | """ 22 | A callable which takes a dataset dict in Detectron2 Dataset format, 23 | and map it into a format used by the model. 24 | 25 | This is the default callable to be used to map your dataset dict into training data. 26 | You may need to follow it to implement your own one for customized logic, 27 | such as a different way to read or transform images. 28 | See :doc:`/tutorials/data_loading` for details. 29 | 30 | The callable currently does the following: 31 | 32 | 1. Read the image from "file_name" 33 | 2. Applies cropping/geometric transforms to the image and annotations 34 | 3. 
Prepare data and annotations to Tensor and :class:`Instances` 35 | """ 36 | 37 | def __init__(self, cfg, is_train=True): 38 | if cfg.INPUT.CROP.ENABLED and is_train: 39 | # self.crop_gen = T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE) 40 | self.crop_gen = T.WiderFace_RandomCrop() 41 | logging.getLogger(__name__).info("CropGen used in training: " + str(self.crop_gen)) 42 | else: 43 | self.crop_gen = None 44 | 45 | if is_train: 46 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 47 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 48 | sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING 49 | 50 | self.tfm_gens = [] 51 | # self.tfm_gens.append(T.WiderFace_ResizeShortestEdge(min_size, max_size, sample_style)) 52 | self.tfm_gens.append(T.ResizeShortestEdge(min_size, max_size, sample_style)) 53 | self.tfm_gens.append(T.RandomFlip(prob=0.5, horizontal=True, vertical=False)) 54 | self.tfm_gens.append(T.RandomFlip(prob=0.5, horizontal=False, vertical=True)) 55 | # self.tfm_gens.append(T.RandomContrast(0.7, 3.2)) 56 | # self.tfm_gens.append(T.RandomBrightness(0.6, 1.8)) 57 | # self.tfm_gens.append(T.RandomSaturation(0.6, 1.4)) 58 | # self.tfm_gens.append(T.RandomLighting(0.1)) 59 | logging.getLogger(__name__).info( 60 | "TransformGens used in training: " + str(self.tfm_gens) 61 | ) 62 | else: 63 | self.tfm_gens = [] 64 | self.tfm_gens.append(T.WiderFace_NoOpTransform()) 65 | 66 | # fmt: off 67 | self.img_format = cfg.INPUT.FORMAT 68 | self.mask_on = cfg.MODEL.MASK_ON 69 | self.mask_format = cfg.INPUT.MASK_FORMAT 70 | self.keypoint_on = cfg.MODEL.KEYPOINT_ON 71 | self.load_proposals = cfg.MODEL.LOAD_PROPOSALS 72 | # fmt: on 73 | if self.keypoint_on and is_train: 74 | # Flip only makes sense in training 75 | self.keypoint_hflip_indices = utils.create_keypoint_hflip_indices(cfg.DATASETS.TRAIN) 76 | else: 77 | self.keypoint_hflip_indices = None 78 | 79 | if self.load_proposals: 80 | self.min_box_side_len = cfg.MODEL.PROPOSAL_GENERATOR.MIN_SIZE 81 | self.proposal_topk = ( 82 | cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN 83 | if is_train 84 | else cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST 85 | ) 86 | self.is_train = is_train 87 | 88 | def __call__(self, dataset_dict): 89 | """ 90 | Args: 91 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 92 | 93 | Returns: 94 | dict: a format that builtin models in detectron2 accept 95 | """ 96 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 97 | # USER: Write your own image loading if it's not from a file 98 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 99 | utils.check_image_size(dataset_dict, image) 100 | 101 | if "annotations" not in dataset_dict: 102 | image, transforms = T.apply_transform_gens( 103 | ([self.crop_gen] if self.crop_gen else []) + self.tfm_gens, image 104 | ) 105 | else: 106 | # Crop around an instance if there are instances in the image. 
107 | # USER: Remove if you don't use cropping 108 | # image, transforms = T.apply_transform_gens(self.tfm_gens, image) 109 | if self.crop_gen: 110 | crop_tfm = utils.gen_crop_transform_with_instance( 111 | self.crop_gen.get_crop_size(image.shape[:2]), 112 | image.shape[:2], 113 | np.random.choice(dataset_dict["annotations"]), 114 | ) 115 | image = crop_tfm.apply_image(image) 116 | 117 | image, transforms = T.apply_transform_gens(self.tfm_gens, image) 118 | if self.crop_gen: 119 | transforms = crop_tfm + transforms 120 | 121 | image_shape = image.shape[:2] # h, w 122 | 123 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 124 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 125 | # Therefore it's important to use torch.Tensor. 126 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 127 | 128 | # USER: Remove if you don't use pre-computed proposals. 129 | if self.load_proposals: 130 | utils.transform_proposals( 131 | dataset_dict, image_shape, transforms, self.min_box_side_len, self.proposal_topk 132 | ) 133 | 134 | if not self.is_train: 135 | # USER: Modify this if you want to keep them for some reason. 136 | dataset_dict.pop("annotations", None) 137 | dataset_dict.pop("sem_seg_file_name", None) 138 | return dataset_dict 139 | 140 | if "annotations" in dataset_dict: 141 | # USER: Modify this if you want to keep them for some reason. 142 | for anno in dataset_dict["annotations"]: 143 | if not self.mask_on: 144 | anno.pop("segmentation", None) 145 | if not self.keypoint_on: 146 | anno.pop("keypoints", None) 147 | 148 | # USER: Implement additional transformations if you have other types of data 149 | annos = [ 150 | utils.transform_instance_annotations( 151 | obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices 152 | ) 153 | for obj in dataset_dict.pop("annotations") 154 | if obj.get("iscrowd", 0) == 0 155 | ] 156 | instances = utils.annotations_to_instances( 157 | annos, image_shape, mask_format=self.mask_format 158 | ) 159 | # Create a tight bounding box from masks, useful when image is cropped 160 | if self.crop_gen and instances.has("gt_masks"): 161 | instances.gt_boxes = instances.gt_masks.get_bounding_boxes() 162 | dataset_dict["instances"] = utils.filter_empty_instances(instances) 163 | 164 | # USER: Remove if you don't do semantic/panoptic segmentation. 
165 | if "sem_seg_file_name" in dataset_dict: 166 | with PathManager.open(dataset_dict.pop("sem_seg_file_name"), "rb") as f: 167 | sem_seg_gt = Image.open(f) 168 | sem_seg_gt = np.asarray(sem_seg_gt, dtype="uint8") 169 | sem_seg_gt = transforms.apply_segmentation(sem_seg_gt) 170 | sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) 171 | dataset_dict["sem_seg"] = sem_seg_gt 172 | return dataset_dict 173 | -------------------------------------------------------------------------------- /mmdet/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lbin/Retinaface_Detectron2/579e500b35efac6afc389dfc9bbea0b129e91ba6/mmdet/evaluation/__init__.py -------------------------------------------------------------------------------- /mmdet/evaluation/box_overlaps.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Sergey Karayev 6 | # -------------------------------------------------------- 7 | 8 | cimport cython 9 | import numpy as np 10 | cimport numpy as np 11 | 12 | DTYPE = np.float 13 | ctypedef np.float_t DTYPE_t 14 | 15 | def bbox_overlaps( 16 | np.ndarray[DTYPE_t, ndim=2] boxes, 17 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 18 | """ 19 | Parameters 20 | ---------- 21 | boxes: (N, 4) ndarray of float 22 | query_boxes: (K, 4) ndarray of float 23 | Returns 24 | ------- 25 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 26 | """ 27 | cdef unsigned int N = boxes.shape[0] 28 | cdef unsigned int K = query_boxes.shape[0] 29 | cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) 30 | cdef DTYPE_t iw, ih, box_area 31 | cdef DTYPE_t ua 32 | cdef unsigned int k, n 33 | for k in range(K): 34 | box_area = ( 35 | (query_boxes[k, 2] - query_boxes[k, 0] + 1) * 36 | (query_boxes[k, 3] - query_boxes[k, 1] + 1) 37 | ) 38 | for n in range(N): 39 | iw = ( 40 | min(boxes[n, 2], query_boxes[k, 2]) - 41 | max(boxes[n, 0], query_boxes[k, 0]) + 1 42 | ) 43 | if iw > 0: 44 | ih = ( 45 | min(boxes[n, 3], query_boxes[k, 3]) - 46 | max(boxes[n, 1], query_boxes[k, 1]) + 1 47 | ) 48 | if ih > 0: 49 | ua = float( 50 | (boxes[n, 2] - boxes[n, 0] + 1) * 51 | (boxes[n, 3] - boxes[n, 1] + 1) + 52 | box_area - iw * ih 53 | ) 54 | overlaps[n, k] = iw * ih / ua 55 | return overlaps -------------------------------------------------------------------------------- /mmdet/evaluation/evaluator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from collections import OrderedDict, defaultdict 4 | 5 | import torch 6 | from detectron2.data import MetadataCatalog 7 | from detectron2.evaluation.evaluator import DatasetEvaluator 8 | from detectron2.utils import comm 9 | 10 | from .widerface_evaluation import evaluation 11 | 12 | 13 | class WiderFaceEvaluator(DatasetEvaluator): 14 | """ 15 | Evaluate Wider Face AP. 16 | It contains a synchronization, therefore has to be called from all ranks. 
17 | """ 18 | 19 | def __init__(self, dataset_name, output_folder): 20 | """ 21 | Args: 22 | dataset_name (str): name of the dataset, e.g., "widerface_val" 23 | """ 24 | self._dataset_name = dataset_name 25 | self._output_folder = output_folder 26 | meta = MetadataCatalog.get(dataset_name) 27 | # data_info = DatasetCatalog.get(dataset_name) 28 | 29 | self._class_names = meta.thing_classes 30 | 31 | self._cpu_device = torch.device("cpu") 32 | self._logger = logging.getLogger(__name__) 33 | 34 | def reset(self): 35 | self._predictions = defaultdict(list) # class name -> list of prediction strings 36 | 37 | def process(self, inputs, outputs): 38 | for input, output in zip(inputs, outputs): 39 | image_id = input["image_id"] 40 | instances = output["instances"].to(self._cpu_device) 41 | boxes = instances.pred_boxes.tensor.numpy() 42 | scores = instances.scores.tolist() 43 | classes = instances.pred_classes.tolist() 44 | for box, score, cls in zip(boxes, scores, classes): 45 | xmin, ymin, xmax, ymax = box 46 | # The inverse of data loading logic in `datasets/pascal_voc.py` 47 | xmin += 1 48 | ymin += 1 49 | 50 | self._predictions[image_id].append([xmin, ymin, xmax, ymax, score]) 51 | 52 | if len(self._predictions[image_id]) == 0: 53 | self._predictions[image_id].append([0, 0, 0, 0, 0]) 54 | 55 | def evaluate(self): 56 | 57 | all_predictions = comm.gather(self._predictions, dst=0) 58 | if not comm.is_main_process(): 59 | return 60 | predictions = defaultdict(list) 61 | for predictions_per_rank in all_predictions: 62 | for clsid, lines in predictions_per_rank.items(): 63 | predictions[clsid].extend(lines) 64 | del all_predictions 65 | 66 | tmp_results_path = os.path.join(self._output_folder, "wider_face_val_results") 67 | 68 | for image_id in predictions.keys(): 69 | tmp_results_file = tmp_results_path + "/" + image_id[:-4] + ".txt" 70 | dirname = os.path.dirname(tmp_results_file) 71 | if not os.path.isdir(dirname): 72 | os.makedirs(dirname) 73 | 74 | with open(tmp_results_file, "w") as fd: 75 | # bboxs = dets 76 | file_name = os.path.basename(tmp_results_file)[:-4] + "\n" 77 | bboxs_num = str(len(predictions[image_id])) + "\n" 78 | fd.write(file_name) 79 | fd.write(bboxs_num) 80 | idx = 0 81 | for box in predictions[image_id]: 82 | 83 | x = int(box[0]) 84 | y = int(box[1]) 85 | w = int(box[2]) - int(box[0]) 86 | h = int(box[3]) - int(box[1]) 87 | confidence = str(float(box[4])) 88 | line = ( 89 | str(x) 90 | + " " 91 | + str(y) 92 | + " " 93 | + str(w) 94 | + " " 95 | + str(h) 96 | + " " 97 | + confidence 98 | + " \n" 99 | ) 100 | fd.write(line) 101 | idx = idx + 1 102 | 103 | aps = evaluation(tmp_results_path, "datasets/widerface/val/ground_truth") 104 | 105 | ret = OrderedDict() 106 | ret["bbox"] = {"Easy": aps[0], "Medium": aps[1], "Hard": aps[2]} 107 | return ret 108 | -------------------------------------------------------------------------------- /mmdet/evaluation/widerface_evaluation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | import sys 5 | 6 | import numpy as np 7 | from scipy.io import loadmat 8 | 9 | from .bbox import bbox_overlaps 10 | 11 | sys.path.append(os.getcwd()) 12 | 13 | 14 | def get_gt_boxes(gt_dir): 15 | """ gt dir: (wider_face_val.mat, wider_easy_val.mat, wider_medium_val.mat, wider_hard_val.mat)""" 16 | 17 | gt_mat = loadmat(os.path.join(gt_dir, "wider_face_val.mat")) 18 | hard_mat = loadmat(os.path.join(gt_dir, "wider_hard_val.mat")) 19 | medium_mat = 
loadmat(os.path.join(gt_dir, "wider_medium_val.mat")) 20 | easy_mat = loadmat(os.path.join(gt_dir, "wider_easy_val.mat")) 21 | 22 | facebox_list = gt_mat["face_bbx_list"] 23 | event_list = gt_mat["event_list"] 24 | file_list = gt_mat["file_list"] 25 | 26 | hard_gt_list = hard_mat["gt_list"] 27 | medium_gt_list = medium_mat["gt_list"] 28 | easy_gt_list = easy_mat["gt_list"] 29 | 30 | return facebox_list, event_list, file_list, hard_gt_list, medium_gt_list, easy_gt_list 31 | 32 | 33 | def get_gt_boxes_from_txt(gt_path, cache_dir): 34 | f = open(gt_path, "r") 35 | state = 0 36 | lines = f.readlines() 37 | lines = list(map(lambda x: x.rstrip("\r\n"), lines)) 38 | boxes = {} 39 | # print(len(lines)) 40 | f.close() 41 | current_boxes = [] 42 | current_name = None 43 | for line in lines: 44 | if state == 0 and "--" in line: 45 | state = 1 46 | current_name = line 47 | continue 48 | if state == 1: 49 | state = 2 50 | continue 51 | 52 | if state == 2 and "--" in line: 53 | state = 1 54 | boxes[current_name] = np.array(current_boxes).astype("float32") 55 | current_name = line 56 | current_boxes = [] 57 | continue 58 | 59 | if state == 2: 60 | box = [float(x) for x in line.split(" ")[:4]] 61 | current_boxes.append(box) 62 | continue 63 | 64 | return boxes 65 | 66 | 67 | def read_pred_file(filepath): 68 | 69 | with open(filepath, "r") as f: 70 | lines = f.readlines() 71 | img_file = lines[0].rstrip("\n\r") 72 | lines = lines[2:] 73 | 74 | # b = lines[0].rstrip('\r\n').split(' ')[:-1] 75 | # c = float(b) 76 | # a = map(lambda x: [[float(a[0]), float(a[1]), float(a[2]), float(a[3]), float(a[4])] for a in x.rstrip('\r\n').split(' ')], lines) 77 | boxes = [] 78 | for line in lines: 79 | line = line.rstrip("\r\n").split(" ") 80 | if line[0] == "": 81 | continue 82 | # a = float(line[4]) 83 | boxes.append( 84 | [float(line[0]), float(line[1]), float(line[2]), float(line[3]), float(line[4])] 85 | ) 86 | boxes = np.array(boxes) 87 | # boxes = np.array(list(map(lambda x: [float(a) for a in x.rstrip('\r\n').split(' ')], lines))).astype('float') 88 | return img_file.split("/")[-1], boxes 89 | 90 | 91 | def get_preds(pred_dir): 92 | events = os.listdir(pred_dir) 93 | boxes = dict() 94 | 95 | for event in events: 96 | event_dir = os.path.join(pred_dir, event) 97 | event_images = os.listdir(event_dir) 98 | current_event = dict() 99 | for imgtxt in event_images: 100 | imgname, _boxes = read_pred_file(os.path.join(event_dir, imgtxt)) 101 | current_event[imgname.rstrip(".jpg")] = _boxes 102 | boxes[event] = current_event 103 | return boxes 104 | 105 | 106 | def norm_score(pred): 107 | """ norm score 108 | pred {key: [[x1,y1,x2,y2,s]]} 109 | """ 110 | 111 | max_score = 0 112 | min_score = 1 113 | 114 | for _, k in pred.items(): 115 | for _, v in k.items(): 116 | if len(v) == 0: 117 | continue 118 | _min = np.min(v[:, -1]) 119 | _max = np.max(v[:, -1]) 120 | max_score = max(_max, max_score) 121 | min_score = min(_min, min_score) 122 | 123 | diff = max_score - min_score 124 | for _, k in pred.items(): 125 | for _, v in k.items(): 126 | if len(v) == 0: 127 | continue 128 | v[:, -1] = (v[:, -1] - min_score) / diff 129 | 130 | 131 | def image_eval(pred, gt, ignore, iou_thresh): 132 | """ single image evaluation 133 | pred: Nx5 134 | gt: Nx4 135 | ignore: 136 | """ 137 | 138 | _pred = pred.copy() 139 | _gt = gt.copy() 140 | pred_recall = np.zeros(_pred.shape[0]) 141 | recall_list = np.zeros(_gt.shape[0]) 142 | proposal_list = np.ones(_pred.shape[0]) 143 | 144 | _pred[:, 2] = _pred[:, 2] + _pred[:, 0] 145 | _pred[:, 3] = 
_pred[:, 3] + _pred[:, 1] 146 | _gt[:, 2] = _gt[:, 2] + _gt[:, 0] 147 | _gt[:, 3] = _gt[:, 3] + _gt[:, 1] 148 | 149 | overlaps = bbox_overlaps(_pred[:, :4], _gt) 150 | 151 | for h in range(_pred.shape[0]): 152 | 153 | gt_overlap = overlaps[h] 154 | max_overlap, max_idx = gt_overlap.max(), gt_overlap.argmax() 155 | if max_overlap >= iou_thresh: 156 | if ignore[max_idx] == 0: 157 | recall_list[max_idx] = -1 158 | proposal_list[h] = -1 159 | elif recall_list[max_idx] == 0: 160 | recall_list[max_idx] = 1 161 | 162 | r_keep_index = np.where(recall_list == 1)[0] 163 | pred_recall[h] = len(r_keep_index) 164 | return pred_recall, proposal_list 165 | 166 | 167 | def img_pr_info(thresh_num, pred_info, proposal_list, pred_recall): 168 | pr_info = np.zeros((thresh_num, 2)).astype("float") 169 | for t in range(thresh_num): 170 | 171 | thresh = 1 - (t + 1) / thresh_num 172 | r_index = np.where(pred_info[:, 4] >= thresh)[0] 173 | if len(r_index) == 0: 174 | pr_info[t, 0] = 0 175 | pr_info[t, 1] = 0 176 | else: 177 | r_index = r_index[-1] 178 | p_index = np.where(proposal_list[: r_index + 1] == 1)[0] 179 | pr_info[t, 0] = len(p_index) 180 | pr_info[t, 1] = pred_recall[r_index] 181 | return pr_info 182 | 183 | 184 | def dataset_pr_info(thresh_num, pr_curve, count_face): 185 | _pr_curve = np.zeros((thresh_num, 2)) 186 | for i in range(thresh_num): 187 | _pr_curve[i, 0] = pr_curve[i, 1] / pr_curve[i, 0] 188 | _pr_curve[i, 1] = pr_curve[i, 1] / count_face 189 | return _pr_curve 190 | 191 | 192 | def voc_ap(rec, prec): 193 | 194 | # correct AP calculation 195 | # first append sentinel values at the end 196 | mrec = np.concatenate(([0.0], rec, [1.0])) 197 | mpre = np.concatenate(([0.0], prec, [0.0])) 198 | 199 | # compute the precision envelope 200 | for i in range(mpre.size - 1, 0, -1): 201 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 202 | 203 | # to calculate area under PR curve, look for points 204 | # where X axis (recall) changes value 205 | i = np.where(mrec[1:] != mrec[:-1])[0] 206 | 207 | # and sum (\Delta recall) * prec 208 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 209 | return ap 210 | 211 | 212 | def evaluation(pred, gt_path, iou_thresh=0.5): 213 | pred = get_preds(pred) 214 | norm_score(pred) 215 | facebox_list, event_list, file_list, hard_gt_list, medium_gt_list, easy_gt_list = get_gt_boxes( 216 | gt_path 217 | ) 218 | event_num = len(event_list) 219 | thresh_num = 1000 220 | setting_gts = [easy_gt_list, medium_gt_list, hard_gt_list] 221 | aps = [] 222 | for setting_id in range(3): 223 | # different setting 224 | gt_list = setting_gts[setting_id] 225 | count_face = 0 226 | pr_curve = np.zeros((thresh_num, 2)).astype("float") 227 | # [hard, medium, easy] 228 | for i in range(event_num): 229 | event_name = str(event_list[i][0][0]) 230 | img_list = file_list[i][0] 231 | pred_list = pred[event_name] 232 | sub_gt_list = gt_list[i][0] 233 | # img_pr_info_list = np.zeros((len(img_list), thresh_num, 2)) 234 | gt_bbx_list = facebox_list[i][0] 235 | 236 | for j in range(len(img_list)): 237 | pred_info = pred_list[str(img_list[j][0][0])] 238 | 239 | gt_boxes = gt_bbx_list[j][0].astype("float") 240 | keep_index = sub_gt_list[j][0] 241 | count_face += len(keep_index) 242 | 243 | if len(gt_boxes) == 0 or len(pred_info) == 0: 244 | continue 245 | ignore = np.zeros(gt_boxes.shape[0]) 246 | if len(keep_index) != 0: 247 | ignore[keep_index - 1] = 1 248 | pred_recall, proposal_list = image_eval(pred_info, gt_boxes, ignore, iou_thresh) 249 | 250 | _img_pr_info = img_pr_info(thresh_num, pred_info, 
proposal_list, pred_recall) 251 | 252 | pr_curve += _img_pr_info 253 | pr_curve = dataset_pr_info(thresh_num, pr_curve, count_face) 254 | 255 | propose = pr_curve[:, 0] 256 | recall = pr_curve[:, 1] 257 | 258 | ap = voc_ap(recall, propose) 259 | aps.append(ap) 260 | 261 | logger = logging.getLogger(__name__) 262 | logger.info("Easy Val AP: {}".format(aps[0])) 263 | logger.info("Medium Val AP: {}".format(aps[1])) 264 | logger.info("Hard Val AP: {}".format(aps[2])) 265 | return aps 266 | 267 | 268 | if __name__ == "__main__": 269 | 270 | parser = argparse.ArgumentParser() 271 | parser.add_argument("-p", "--pred", default="./tools/widerface_evaluate/widerface_txt/") 272 | parser.add_argument("-g", "--gt", default="./tools/widerface_evaluate/ground_truth/") 273 | 274 | args = parser.parse_args() 275 | evaluation(args.pred, args.gt) 276 | -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lbin/Retinaface_Detectron2/579e500b35efac6afc389dfc9bbea0b129e91ba6/mmdet/layers/DCNv2/__init__.py -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/dcn_v2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import absolute_import, division, print_function 3 | 4 | import math 5 | 6 | import torch 7 | from torch import nn 8 | from torch.autograd import Function 9 | from torch.autograd.function import once_differentiable 10 | from torch.nn.modules.utils import _pair 11 | 12 | import _ext as _backend 13 | 14 | 15 | class _DCNv2(Function): 16 | @staticmethod 17 | def forward( 18 | ctx, input, offset, mask, weight, bias, stride, padding, dilation, deformable_groups 19 | ): 20 | ctx.stride = _pair(stride) 21 | ctx.padding = _pair(padding) 22 | ctx.dilation = _pair(dilation) 23 | ctx.kernel_size = _pair(weight.shape[2:4]) 24 | ctx.deformable_groups = deformable_groups 25 | output = _backend.dcn_v2_forward( 26 | input, 27 | weight, 28 | bias, 29 | offset, 30 | mask, 31 | ctx.kernel_size[0], 32 | ctx.kernel_size[1], 33 | ctx.stride[0], 34 | ctx.stride[1], 35 | ctx.padding[0], 36 | ctx.padding[1], 37 | ctx.dilation[0], 38 | ctx.dilation[1], 39 | ctx.deformable_groups, 40 | ) 41 | ctx.save_for_backward(input, offset, mask, weight, bias) 42 | return output 43 | 44 | @staticmethod 45 | @once_differentiable 46 | def backward(ctx, grad_output): 47 | input, offset, mask, weight, bias = ctx.saved_tensors 48 | grad_input, grad_offset, grad_mask, grad_weight, grad_bias = _backend.dcn_v2_backward( 49 | input, 50 | weight, 51 | bias, 52 | offset, 53 | mask, 54 | grad_output, 55 | ctx.kernel_size[0], 56 | ctx.kernel_size[1], 57 | ctx.stride[0], 58 | ctx.stride[1], 59 | ctx.padding[0], 60 | ctx.padding[1], 61 | ctx.dilation[0], 62 | ctx.dilation[1], 63 | ctx.deformable_groups, 64 | ) 65 | 66 | return grad_input, grad_offset, grad_mask, grad_weight, grad_bias, None, None, None, None 67 | 68 | 69 | dcn_v2_conv = _DCNv2.apply 70 | 71 | 72 | class DCNv2(nn.Module): 73 | def __init__( 74 | self, 75 | in_channels, 76 | out_channels, 77 | kernel_size, 78 | stride, 79 | padding, 80 | dilation=1, 81 | deformable_groups=1, 82 | ): 83 | super(DCNv2, self).__init__() 84 | self.in_channels = in_channels 85 | self.out_channels = out_channels 86 | self.kernel_size = _pair(kernel_size) 87 | self.stride = _pair(stride) 88 | self.padding = 
_pair(padding) 89 | self.dilation = _pair(dilation) 90 | self.deformable_groups = deformable_groups 91 | 92 | self.weight = nn.Parameter(torch.Tensor(out_channels, in_channels, *self.kernel_size)) 93 | self.bias = nn.Parameter(torch.Tensor(out_channels)) 94 | self.reset_parameters() 95 | 96 | def reset_parameters(self): 97 | n = self.in_channels 98 | for k in self.kernel_size: 99 | n *= k 100 | stdv = 1.0 / math.sqrt(n) 101 | self.weight.data.uniform_(-stdv, stdv) 102 | self.bias.data.zero_() 103 | 104 | def forward(self, input, offset, mask): 105 | assert ( 106 | 2 * self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] 107 | == offset.shape[1] 108 | ) 109 | assert self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == mask.shape[1] 110 | return dcn_v2_conv( 111 | input, 112 | offset, 113 | mask, 114 | self.weight, 115 | self.bias, 116 | self.stride, 117 | self.padding, 118 | self.dilation, 119 | self.deformable_groups, 120 | ) 121 | 122 | 123 | class DCN(DCNv2): 124 | def __init__( 125 | self, 126 | in_channels, 127 | out_channels, 128 | kernel_size, 129 | stride, 130 | padding, 131 | dilation=1, 132 | deformable_groups=1, 133 | ): 134 | super(DCN, self).__init__( 135 | in_channels, out_channels, kernel_size, stride, padding, dilation, deformable_groups 136 | ) 137 | 138 | channels_ = self.deformable_groups * 3 * self.kernel_size[0] * self.kernel_size[1] 139 | self.conv_offset_mask = nn.Conv2d( 140 | self.in_channels, 141 | channels_, 142 | kernel_size=self.kernel_size, 143 | stride=self.stride, 144 | padding=self.padding, 145 | bias=True, 146 | ) 147 | self.init_offset() 148 | 149 | def init_offset(self): 150 | self.conv_offset_mask.weight.data.zero_() 151 | self.conv_offset_mask.bias.data.zero_() 152 | 153 | def forward(self, input): 154 | out = self.conv_offset_mask(input) 155 | o1, o2, mask = torch.chunk(out, 3, dim=1) 156 | offset = torch.cat((o1, o2), dim=1) 157 | mask = torch.sigmoid(mask) 158 | return dcn_v2_conv( 159 | input, 160 | offset, 161 | mask, 162 | self.weight, 163 | self.bias, 164 | self.stride, 165 | self.padding, 166 | self.dilation, 167 | self.deformable_groups, 168 | ) 169 | 170 | 171 | class _DCNv2Pooling(Function): 172 | @staticmethod 173 | def forward( 174 | ctx, 175 | input, 176 | rois, 177 | offset, 178 | spatial_scale, 179 | pooled_size, 180 | output_dim, 181 | no_trans, 182 | group_size=1, 183 | part_size=None, 184 | sample_per_part=4, 185 | trans_std=0.0, 186 | ): 187 | ctx.spatial_scale = spatial_scale 188 | ctx.no_trans = int(no_trans) 189 | ctx.output_dim = output_dim 190 | ctx.group_size = group_size 191 | ctx.pooled_size = pooled_size 192 | ctx.part_size = pooled_size if part_size is None else part_size 193 | ctx.sample_per_part = sample_per_part 194 | ctx.trans_std = trans_std 195 | 196 | output, output_count = _backend.dcn_v2_psroi_pooling_forward( 197 | input, 198 | rois, 199 | offset, 200 | ctx.no_trans, 201 | ctx.spatial_scale, 202 | ctx.output_dim, 203 | ctx.group_size, 204 | ctx.pooled_size, 205 | ctx.part_size, 206 | ctx.sample_per_part, 207 | ctx.trans_std, 208 | ) 209 | ctx.save_for_backward(input, rois, offset, output_count) 210 | return output 211 | 212 | @staticmethod 213 | @once_differentiable 214 | def backward(ctx, grad_output): 215 | input, rois, offset, output_count = ctx.saved_tensors 216 | grad_input, grad_offset = _backend.dcn_v2_psroi_pooling_backward( 217 | grad_output, 218 | input, 219 | rois, 220 | offset, 221 | output_count, 222 | ctx.no_trans, 223 | ctx.spatial_scale, 224 | ctx.output_dim, 225 | 
ctx.group_size, 226 | ctx.pooled_size, 227 | ctx.part_size, 228 | ctx.sample_per_part, 229 | ctx.trans_std, 230 | ) 231 | 232 | return grad_input, None, grad_offset, None, None, None, None, None, None, None, None 233 | 234 | 235 | dcn_v2_pooling = _DCNv2Pooling.apply 236 | 237 | 238 | class DCNv2Pooling(nn.Module): 239 | def __init__( 240 | self, 241 | spatial_scale, 242 | pooled_size, 243 | output_dim, 244 | no_trans, 245 | group_size=1, 246 | part_size=None, 247 | sample_per_part=4, 248 | trans_std=0.0, 249 | ): 250 | super(DCNv2Pooling, self).__init__() 251 | self.spatial_scale = spatial_scale 252 | self.pooled_size = pooled_size 253 | self.output_dim = output_dim 254 | self.no_trans = no_trans 255 | self.group_size = group_size 256 | self.part_size = pooled_size if part_size is None else part_size 257 | self.sample_per_part = sample_per_part 258 | self.trans_std = trans_std 259 | 260 | def forward(self, input, rois, offset): 261 | assert input.shape[1] == self.output_dim 262 | if self.no_trans: 263 | offset = input.new() 264 | return dcn_v2_pooling( 265 | input, 266 | rois, 267 | offset, 268 | self.spatial_scale, 269 | self.pooled_size, 270 | self.output_dim, 271 | self.no_trans, 272 | self.group_size, 273 | self.part_size, 274 | self.sample_per_part, 275 | self.trans_std, 276 | ) 277 | 278 | 279 | class DCNPooling(DCNv2Pooling): 280 | def __init__( 281 | self, 282 | spatial_scale, 283 | pooled_size, 284 | output_dim, 285 | no_trans, 286 | group_size=1, 287 | part_size=None, 288 | sample_per_part=4, 289 | trans_std=0.0, 290 | deform_fc_dim=1024, 291 | ): 292 | super(DCNPooling, self).__init__( 293 | spatial_scale, 294 | pooled_size, 295 | output_dim, 296 | no_trans, 297 | group_size, 298 | part_size, 299 | sample_per_part, 300 | trans_std, 301 | ) 302 | 303 | self.deform_fc_dim = deform_fc_dim 304 | 305 | if not no_trans: 306 | self.offset_mask_fc = nn.Sequential( 307 | nn.Linear( 308 | self.pooled_size * self.pooled_size * self.output_dim, self.deform_fc_dim 309 | ), 310 | nn.ReLU(inplace=True), 311 | nn.Linear(self.deform_fc_dim, self.deform_fc_dim), 312 | nn.ReLU(inplace=True), 313 | nn.Linear(self.deform_fc_dim, self.pooled_size * self.pooled_size * 3), 314 | ) 315 | self.offset_mask_fc[4].weight.data.zero_() 316 | self.offset_mask_fc[4].bias.data.zero_() 317 | 318 | def forward(self, input, rois): 319 | offset = input.new() 320 | 321 | if not self.no_trans: 322 | 323 | # do roi_align first 324 | n = rois.shape[0] 325 | roi = dcn_v2_pooling( 326 | input, 327 | rois, 328 | offset, 329 | self.spatial_scale, 330 | self.pooled_size, 331 | self.output_dim, 332 | True, # no trans 333 | self.group_size, 334 | self.part_size, 335 | self.sample_per_part, 336 | self.trans_std, 337 | ) 338 | 339 | # build mask and offset 340 | offset_mask = self.offset_mask_fc(roi.view(n, -1)) 341 | offset_mask = offset_mask.view(n, 3, self.pooled_size, self.pooled_size) 342 | o1, o2, mask = torch.chunk(offset_mask, 3, dim=1) 343 | offset = torch.cat((o1, o2), dim=1) 344 | mask = torch.sigmoid(mask) 345 | 346 | # do pooling with offset and mask 347 | return ( 348 | dcn_v2_pooling( 349 | input, 350 | rois, 351 | offset, 352 | self.spatial_scale, 353 | self.pooled_size, 354 | self.output_dim, 355 | self.no_trans, 356 | self.group_size, 357 | self.part_size, 358 | self.sample_per_part, 359 | self.trans_std, 360 | ) 361 | * mask 362 | ) 363 | # only roi_align 364 | return dcn_v2_pooling( 365 | input, 366 | rois, 367 | offset, 368 | self.spatial_scale, 369 | self.pooled_size, 370 | self.output_dim, 371 | 
self.no_trans, 372 | self.group_size, 373 | self.part_size, 374 | self.sample_per_part, 375 | self.trans_std, 376 | ) 377 | -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/src/cpu/dcn_v2_cpu.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "cpu/dcn_v2_im2col_cpu.h" 3 | 4 | #include 5 | //#include 6 | 7 | #include 8 | //#include 9 | //#include 10 | 11 | //extern THCState *state; 12 | 13 | // author: Charles Shang 14 | // https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu 15 | // modified from the CUDA version for CPU use by Daniel K. Suhendro 16 | 17 | at::Tensor 18 | dcn_v2_cpu_forward(const at::Tensor &input, 19 | const at::Tensor &weight, 20 | const at::Tensor &bias, 21 | const at::Tensor &offset, 22 | const at::Tensor &mask, 23 | const int kernel_h, 24 | const int kernel_w, 25 | const int stride_h, 26 | const int stride_w, 27 | const int pad_h, 28 | const int pad_w, 29 | const int dilation_h, 30 | const int dilation_w, 31 | const int deformable_group) 32 | { 33 | // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask)); 34 | /*AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 35 | AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); 36 | AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); 37 | AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); 38 | AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");*/ 39 | 40 | const int batch = input.size(0); 41 | const int channels = input.size(1); 42 | const int height = input.size(2); 43 | const int width = input.size(3); 44 | 45 | const int channels_out = weight.size(0); 46 | const int channels_kernel = weight.size(1); 47 | const int kernel_h_ = weight.size(2); 48 | const int kernel_w_ = weight.size(3); 49 | 50 | // printf("Kernels: %d %d %d %d\n", kernel_h_, kernel_w_, kernel_w, kernel_h); 51 | // printf("Channels: %d %d\n", channels, channels_kernel); 52 | // printf("Channels: %d %d\n", channels_out, channels_kernel); 53 | 54 | AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, 55 | "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); 56 | 57 | AT_ASSERTM(channels == channels_kernel, 58 | "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); 59 | 60 | const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; 61 | const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; 62 | 63 | auto ones = at::ones({height_out, width_out}, input.options()); 64 | auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); 65 | auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); 66 | 67 | using scalar_t = float; 68 | for (int b = 0; b < batch; b++) 69 | { 70 | auto input_n = input.select(0, b); 71 | auto offset_n = offset.select(0, b); 72 | auto mask_n = mask.select(0, b); 73 | auto output_n = output.select(0, b); 74 | 75 | // Do Bias first: 76 | // M,N,K are dims of matrix A and B 77 | // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) 78 | // (N x 1) (1 x M) 79 | long m_ = channels_out; 80 | long n_ = height_out * width_out; 81 | long k_ = 1; 82 | THFloatBlas_gemm('t', 'n', n_, m_, k_, 1.0f, 83 | ones.contiguous().data(), k_, 84 | 
bias.contiguous().data(), k_, 0.0f, 85 | output_n.data(), n_); 86 | 87 | modulated_deformable_im2col_cpu(input_n.data(), 88 | offset_n.data(), 89 | mask_n.data(), 90 | 1, channels, height, width, 91 | height_out, width_out, kernel_h, kernel_w, 92 | pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, 93 | deformable_group, 94 | columns.data()); 95 | 96 | //(k * m) x (m * n) 97 | // Y = WC 98 | long m = channels_out; 99 | long n = height_out * width_out; 100 | long k = channels * kernel_h * kernel_w; 101 | THFloatBlas_gemm('n', 'n', n, m, k, 1.0f, 102 | columns.data(), n, 103 | weight.data(), k, 1.0f, 104 | output_n.data(), n); 105 | } 106 | return output; 107 | } 108 | 109 | std::vector dcn_v2_cpu_backward(const at::Tensor &input, 110 | const at::Tensor &weight, 111 | const at::Tensor &bias, 112 | const at::Tensor &offset, 113 | const at::Tensor &mask, 114 | const at::Tensor &grad_output, 115 | int kernel_h, int kernel_w, 116 | int stride_h, int stride_w, 117 | int pad_h, int pad_w, 118 | int dilation_h, int dilation_w, 119 | int deformable_group) 120 | { 121 | 122 | THArgCheck(input.is_contiguous(), 1, "input tensor has to be contiguous"); 123 | THArgCheck(weight.is_contiguous(), 2, "weight tensor has to be contiguous"); 124 | 125 | /*AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 126 | AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); 127 | AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); 128 | AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); 129 | AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");*/ 130 | 131 | const int batch = input.size(0); 132 | const int channels = input.size(1); 133 | const int height = input.size(2); 134 | const int width = input.size(3); 135 | 136 | const int channels_out = weight.size(0); 137 | const int channels_kernel = weight.size(1); 138 | const int kernel_h_ = weight.size(2); 139 | const int kernel_w_ = weight.size(3); 140 | 141 | AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, 142 | "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); 143 | 144 | AT_ASSERTM(channels == channels_kernel, 145 | "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); 146 | 147 | const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; 148 | const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; 149 | 150 | auto ones = at::ones({height_out, width_out}, input.options()); 151 | auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); 152 | auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); 153 | 154 | auto grad_input = at::zeros_like(input); 155 | auto grad_weight = at::zeros_like(weight); 156 | auto grad_bias = at::zeros_like(bias); 157 | auto grad_offset = at::zeros_like(offset); 158 | auto grad_mask = at::zeros_like(mask); 159 | 160 | using scalar_t = float; 161 | 162 | for (int b = 0; b < batch; b++) 163 | { 164 | auto input_n = input.select(0, b); 165 | auto offset_n = offset.select(0, b); 166 | auto mask_n = mask.select(0, b); 167 | auto grad_output_n = grad_output.select(0, b); 168 | auto grad_input_n = grad_input.select(0, b); 169 | auto grad_offset_n = grad_offset.select(0, b); 170 | auto grad_mask_n = grad_mask.select(0, b); 171 | 172 | long m = channels * kernel_h * kernel_w; 173 | long n = height_out * width_out; 174 
| long k = channels_out; 175 | 176 | THFloatBlas_gemm('n', 't', n, m, k, 1.0f, 177 | grad_output_n.data(), n, 178 | weight.data(), m, 0.0f, 179 | columns.data(), n); 180 | 181 | // gradient w.r.t. input coordinate data 182 | modulated_deformable_col2im_coord_cpu(columns.data(), 183 | input_n.data(), 184 | offset_n.data(), 185 | mask_n.data(), 186 | 1, channels, height, width, 187 | height_out, width_out, kernel_h, kernel_w, 188 | pad_h, pad_w, stride_h, stride_w, 189 | dilation_h, dilation_w, deformable_group, 190 | grad_offset_n.data(), 191 | grad_mask_n.data()); 192 | // gradient w.r.t. input data 193 | modulated_deformable_col2im_cpu(columns.data(), 194 | offset_n.data(), 195 | mask_n.data(), 196 | 1, channels, height, width, 197 | height_out, width_out, kernel_h, kernel_w, 198 | pad_h, pad_w, stride_h, stride_w, 199 | dilation_h, dilation_w, deformable_group, 200 | grad_input_n.data()); 201 | 202 | // gradient w.r.t. weight, dWeight should accumulate across the batch and group 203 | modulated_deformable_im2col_cpu(input_n.data(), 204 | offset_n.data(), 205 | mask_n.data(), 206 | 1, channels, height, width, 207 | height_out, width_out, kernel_h, kernel_w, 208 | pad_h, pad_w, stride_h, stride_w, 209 | dilation_h, dilation_w, deformable_group, 210 | columns.data()); 211 | 212 | long m_ = channels_out; 213 | long n_ = channels * kernel_h * kernel_w; 214 | long k_ = height_out * width_out; 215 | 216 | THFloatBlas_gemm('t', 'n', n_, m_, k_, 1.0f, 217 | columns.data(), k_, 218 | grad_output_n.data(), k_, 1.0f, 219 | grad_weight.data(), n_); 220 | 221 | // gradient w.r.t. bias 222 | // long m_ = channels_out; 223 | // long k__ = height_out * width_out; 224 | THFloatBlas_gemv('t', k_, m_, 1.0f, 225 | grad_output_n.data(), k_, 226 | ones.data(), 1, 1.0f, 227 | grad_bias.data(), 1); 228 | } 229 | 230 | return { 231 | grad_input, grad_offset, grad_mask, grad_weight, grad_bias 232 | }; 233 | } -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/src/cpu/dcn_v2_im2col_cpu.cpp: -------------------------------------------------------------------------------- 1 | #include "dcn_v2_im2col_cpu.h" 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | //#include 8 | 9 | #include 10 | //#include 11 | //#include 12 | 13 | // modified from the CUDA version for CPU use by Daniel K. 
Suhendro 14 | 15 | /*#define CUDA_KERNEL_LOOP(i, n) \ 16 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ 17 | i < (n); \ 18 | i += blockDim.x * gridDim.x) 19 | 20 | const int CUDA_NUM_THREADS = 1024; 21 | inline int GET_BLOCKS(const int N) 22 | { 23 | return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; 24 | }*/ 25 | 26 | 27 | float dmcn_im2col_bilinear_cpu(const float *bottom_data, const int data_width, 28 | const int height, const int width, float h, float w) 29 | { 30 | int h_low = floor(h); 31 | int w_low = floor(w); 32 | int h_high = h_low + 1; 33 | int w_high = w_low + 1; 34 | 35 | float lh = h - h_low; 36 | float lw = w - w_low; 37 | float hh = 1 - lh, hw = 1 - lw; 38 | 39 | float v1 = 0; 40 | if (h_low >= 0 && w_low >= 0) 41 | v1 = bottom_data[h_low * data_width + w_low]; 42 | float v2 = 0; 43 | if (h_low >= 0 && w_high <= width - 1) 44 | v2 = bottom_data[h_low * data_width + w_high]; 45 | float v3 = 0; 46 | if (h_high <= height - 1 && w_low >= 0) 47 | v3 = bottom_data[h_high * data_width + w_low]; 48 | float v4 = 0; 49 | if (h_high <= height - 1 && w_high <= width - 1) 50 | v4 = bottom_data[h_high * data_width + w_high]; 51 | 52 | float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; 53 | 54 | float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); 55 | return val; 56 | } 57 | 58 | float dmcn_get_gradient_weight_cpu(float argmax_h, float argmax_w, 59 | const int h, const int w, const int height, const int width) 60 | { 61 | if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) 62 | { 63 | //empty 64 | return 0; 65 | } 66 | 67 | int argmax_h_low = floor(argmax_h); 68 | int argmax_w_low = floor(argmax_w); 69 | int argmax_h_high = argmax_h_low + 1; 70 | int argmax_w_high = argmax_w_low + 1; 71 | 72 | float weight = 0; 73 | if (h == argmax_h_low && w == argmax_w_low) 74 | weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); 75 | if (h == argmax_h_low && w == argmax_w_high) 76 | weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); 77 | if (h == argmax_h_high && w == argmax_w_low) 78 | weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); 79 | if (h == argmax_h_high && w == argmax_w_high) 80 | weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); 81 | return weight; 82 | } 83 | 84 | float dmcn_get_coordinate_weight_cpu(float argmax_h, float argmax_w, 85 | const int height, const int width, const float *im_data, 86 | const int data_width, const int bp_dir) 87 | { 88 | if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) 89 | { 90 | //empty 91 | return 0; 92 | } 93 | 94 | int argmax_h_low = floor(argmax_h); 95 | int argmax_w_low = floor(argmax_w); 96 | int argmax_h_high = argmax_h_low + 1; 97 | int argmax_w_high = argmax_w_low + 1; 98 | 99 | float weight = 0; 100 | 101 | if (bp_dir == 0) 102 | { 103 | if (argmax_h_low >= 0 && argmax_w_low >= 0) 104 | weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; 105 | if (argmax_h_low >= 0 && argmax_w_high <= width - 1) 106 | weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; 107 | if (argmax_h_high <= height - 1 && argmax_w_low >= 0) 108 | weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; 109 | if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) 110 | weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; 111 | } 112 | else if (bp_dir == 1) 113 | { 114 | if (argmax_h_low >= 0 && argmax_w_low >= 0) 115 | weight += -1 
* (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; 116 | if (argmax_h_low >= 0 && argmax_w_high <= width - 1) 117 | weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; 118 | if (argmax_h_high <= height - 1 && argmax_w_low >= 0) 119 | weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; 120 | if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) 121 | weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; 122 | } 123 | 124 | return weight; 125 | } 126 | 127 | void modulated_deformable_im2col_cpu_kernel(const int n, const float *data_im, const float *data_offset, const float *data_mask, 128 | const int height, const int width, const int kernel_h, const int kernel_w, 129 | const int pad_h, const int pad_w, 130 | const int stride_h, const int stride_w, 131 | const int dilation_h, const int dilation_w, 132 | const int channel_per_deformable_group, 133 | const int batch_size, const int num_channels, const int deformable_group, 134 | const int height_col, const int width_col, 135 | float *data_col) 136 | { 137 | // launch channels * batch_size * height_col * width_col cores 138 | for(int index=0; index(0); 178 | const float h_im = h_in + i * dilation_h + offset_h; 179 | const float w_im = w_in + j * dilation_w + offset_w; 180 | //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { 181 | if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) 182 | { 183 | //const float map_h = i * dilation_h + offset_h; 184 | //const float map_w = j * dilation_w + offset_w; 185 | //const int cur_height = height - h_in; 186 | //const int cur_width = width - w_in; 187 | //val = dmcn_im2col_bilinear_cpu(data_im_ptr, width, cur_height, cur_width, map_h, map_w); 188 | val = dmcn_im2col_bilinear_cpu(data_im_ptr, width, height, width, h_im, w_im); 189 | } 190 | *data_col_ptr = val * mask; 191 | // data_col_ptr += batch_size * height_col * width_col; 192 | data_col_ptr += height_col * width_col; 193 | } 194 | } 195 | } 196 | } 197 | 198 | void modulated_deformable_col2im_cpu_kernel(const int n, const float *data_col, const float *data_offset, const float *data_mask, 199 | const int channels, const int height, const int width, 200 | const int kernel_h, const int kernel_w, 201 | const int pad_h, const int pad_w, 202 | const int stride_h, const int stride_w, 203 | const int dilation_h, const int dilation_w, 204 | const int channel_per_deformable_group, 205 | const int batch_size, const int deformable_group, 206 | const int height_col, const int width_col, 207 | float *grad_im) 208 | { 209 | for(int index = 0; index < n; index++) 210 | { 211 | const int j = (index / width_col / height_col / batch_size) % kernel_w; 212 | const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; 213 | const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; 214 | // compute the start and end of the output 215 | 216 | const int deformable_group_index = c / channel_per_deformable_group; 217 | 218 | int w_out = index % width_col; 219 | int h_out = (index / width_col) % height_col; 220 | int b = (index / width_col / height_col) % batch_size; 221 | int w_in = w_out * stride_w - pad_w; 222 | int h_in = h_out * stride_h - pad_h; 223 | 224 | const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; 225 | const float *data_mask_ptr = data_mask 
+ (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; 226 | const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; 227 | const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; 228 | const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; 229 | const float offset_h = data_offset_ptr[data_offset_h_ptr]; 230 | const float offset_w = data_offset_ptr[data_offset_w_ptr]; 231 | const float mask = data_mask_ptr[data_mask_hw_ptr]; 232 | const float cur_inv_h_data = h_in + i * dilation_h + offset_h; 233 | const float cur_inv_w_data = w_in + j * dilation_w + offset_w; 234 | 235 | const float cur_top_grad = data_col[index] * mask; 236 | const int cur_h = (int)cur_inv_h_data; 237 | const int cur_w = (int)cur_inv_w_data; 238 | 239 | for (int dy = -2; dy <= 2; dy++) 240 | { 241 | for (int dx = -2; dx <= 2; dx++) 242 | { 243 | if (cur_h + dy >= 0 && cur_h + dy < height && 244 | cur_w + dx >= 0 && cur_w + dx < width && 245 | abs(cur_inv_h_data - (cur_h + dy)) < 1 && 246 | abs(cur_inv_w_data - (cur_w + dx)) < 1) 247 | { 248 | int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; 249 | float weight = dmcn_get_gradient_weight_cpu(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); 250 | //atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); 251 | *(grad_im + cur_bottom_grad_pos) += weight * cur_top_grad; 252 | 253 | } 254 | } 255 | } 256 | } 257 | } 258 | 259 | void modulated_deformable_col2im_coord_cpu_kernel(const int n, const float *data_col, const float *data_im, 260 | const float *data_offset, const float *data_mask, 261 | const int channels, const int height, const int width, 262 | const int kernel_h, const int kernel_w, 263 | const int pad_h, const int pad_w, 264 | const int stride_h, const int stride_w, 265 | const int dilation_h, const int dilation_w, 266 | const int channel_per_deformable_group, 267 | const int batch_size, const int offset_channels, const int deformable_group, 268 | const int height_col, const int width_col, 269 | float *grad_offset, float *grad_mask) 270 | { 271 | for(int index = 0; index < n; index++) 272 | { 273 | float val = 0, mval = 0; 274 | int w = index % width_col; 275 | int h = (index / width_col) % height_col; 276 | int c = (index / width_col / height_col) % offset_channels; 277 | int b = (index / width_col / height_col) / offset_channels; 278 | // compute the start and end of the output 279 | 280 | const int deformable_group_index = c / (2 * kernel_h * kernel_w); 281 | const int col_step = kernel_h * kernel_w; 282 | int cnt = 0; 283 | const float *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; 284 | const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; 285 | const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; 286 | const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; 287 | 288 | const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; 289 | 290 | for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) 291 | { 292 | const int col_pos = 
(((col_c * batch_size + b) * height_col) + h) * width_col + w; 293 | const int bp_dir = offset_c % 2; 294 | 295 | int j = (col_pos / width_col / height_col / batch_size) % kernel_w; 296 | int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; 297 | int w_out = col_pos % width_col; 298 | int h_out = (col_pos / width_col) % height_col; 299 | int w_in = w_out * stride_w - pad_w; 300 | int h_in = h_out * stride_h - pad_h; 301 | const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); 302 | const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); 303 | const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); 304 | const float offset_h = data_offset_ptr[data_offset_h_ptr]; 305 | const float offset_w = data_offset_ptr[data_offset_w_ptr]; 306 | const float mask = data_mask_ptr[data_mask_hw_ptr]; 307 | float inv_h = h_in + i * dilation_h + offset_h; 308 | float inv_w = w_in + j * dilation_w + offset_w; 309 | if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) 310 | { 311 | inv_h = inv_w = -2; 312 | } 313 | else 314 | { 315 | mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear_cpu(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); 316 | } 317 | const float weight = dmcn_get_coordinate_weight_cpu( 318 | inv_h, inv_w, 319 | height, width, data_im_ptr + cnt * height * width, width, bp_dir); 320 | val += weight * data_col_ptr[col_pos] * mask; 321 | cnt += 1; 322 | } 323 | // KERNEL_ASSIGN(grad_offset[index], offset_req, val); 324 | grad_offset[index] = val; 325 | if (offset_c % 2 == 0) 326 | // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval); 327 | grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; 328 | } 329 | } 330 | 331 | void modulated_deformable_im2col_cpu(const float* data_im, const float* data_offset, const float* data_mask, 332 | const int batch_size, const int channels, const int height_im, const int width_im, 333 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 334 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 335 | const int dilation_h, const int dilation_w, 336 | const int deformable_group, float* data_col) { 337 | // num_axes should be smaller than block size 338 | const int channel_per_deformable_group = channels / deformable_group; 339 | const int num_kernels = channels * batch_size * height_col * width_col; 340 | modulated_deformable_im2col_cpu_kernel( 341 | num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w, 342 | pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, 343 | batch_size, channels, deformable_group, height_col, width_col, data_col); 344 | 345 | /*cudaError_t err = cudaGetLastError(); 346 | if (err != cudaSuccess) 347 | { 348 | printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); 349 | }*/ 350 | 351 | } 352 | 353 | void modulated_deformable_col2im_cpu(const float* data_col, const float* data_offset, const float* data_mask, 354 | const int batch_size, const int channels, const int height_im, const int width_im, 355 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 356 | const int pad_h, 
const int pad_w, const int stride_h, const int stride_w, 357 | const int dilation_h, const int dilation_w, 358 | const int deformable_group, float* grad_im){ 359 | 360 | const int channel_per_deformable_group = channels / deformable_group; 361 | const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; 362 | modulated_deformable_col2im_cpu_kernel( 363 | num_kernels, data_col, data_offset, data_mask, channels, height_im, width_im, 364 | kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, 365 | dilation_h, dilation_w, channel_per_deformable_group, 366 | batch_size, deformable_group, height_col, width_col, grad_im); 367 | /*cudaError_t err = cudaGetLastError(); 368 | if (err != cudaSuccess) 369 | { 370 | printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); 371 | }*/ 372 | 373 | } 374 | 375 | void modulated_deformable_col2im_coord_cpu(const float* data_col, const float* data_im, const float* data_offset, const float* data_mask, 376 | const int batch_size, const int channels, const int height_im, const int width_im, 377 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 378 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 379 | const int dilation_h, const int dilation_w, 380 | const int deformable_group, 381 | float* grad_offset, float* grad_mask) { 382 | const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; 383 | const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; 384 | modulated_deformable_col2im_coord_cpu_kernel( 385 | num_kernels, data_col, data_im, data_offset, data_mask, channels, height_im, width_im, 386 | kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, 387 | dilation_h, dilation_w, channel_per_deformable_group, 388 | batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, 389 | grad_offset, grad_mask); 390 | /*cudaError_t err = cudaGetLastError(); 391 | if (err != cudaSuccess) 392 | { 393 | printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); 394 | }*/ 395 | } -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/src/cpu/dcn_v2_im2col_cpu.h: -------------------------------------------------------------------------------- 1 | 2 | /*! 3 | ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** 4 | * 5 | * COPYRIGHT 6 | * 7 | * All contributions by the University of California: 8 | * Copyright (c) 2014-2017 The Regents of the University of California (Regents) 9 | * All rights reserved. 10 | * 11 | * All other contributions: 12 | * Copyright (c) 2014-2017, the respective contributors 13 | * All rights reserved. 14 | * 15 | * Caffe uses a shared copyright model: each contributor holds copyright over 16 | * their contributions to Caffe. The project versioning records all such 17 | * contribution and copyright details. If a contributor wants to further mark 18 | * their specific copyright on a particular contribution, they should indicate 19 | * their copyright solely in the commit message of the change when it is 20 | * committed. 21 | * 22 | * LICENSE 23 | * 24 | * Redistribution and use in source and binary forms, with or without 25 | * modification, are permitted provided that the following conditions are met: 26 | * 27 | * 1.
Redistributions of source code must retain the above copyright notice, this 28 | * list of conditions and the following disclaimer. 29 | * 2. Redistributions in binary form must reproduce the above copyright notice, 30 | * this list of conditions and the following disclaimer in the documentation 31 | * and/or other materials provided with the distribution. 32 | * 33 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 34 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 35 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 36 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 37 | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 38 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 39 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 40 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 41 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 42 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 43 | * 44 | * CONTRIBUTION AGREEMENT 45 | * 46 | * By contributing to the BVLC/caffe repository through pull-request, comment, 47 | * or otherwise, the contributor releases their content to the 48 | * license and copyright terms herein. 49 | * 50 | ***************** END Caffe Copyright Notice and Disclaimer ******************** 51 | * 52 | * Copyright (c) 2018 Microsoft 53 | * Licensed under The MIT License [see LICENSE for details] 54 | * \file modulated_deformable_im2col.h 55 | * \brief Function definitions of converting an image to 56 | * column matrix based on kernel, padding, dilation, and offset. 57 | * These functions are mainly used in deformable convolution operators. 58 | * \ref: https://arxiv.org/abs/1811.11168 59 | * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu 60 | */ 61 | 62 | /***************** Adapted by Charles Shang *********************/ 63 | // modified from the CUDA version for CPU use by Daniel K. 
Suhendro 64 | 65 | #ifndef DCN_V2_IM2COL_CPU 66 | #define DCN_V2_IM2COL_CPU 67 | 68 | #ifdef __cplusplus 69 | extern "C" 70 | { 71 | #endif 72 | 73 | void modulated_deformable_im2col_cpu(const float *data_im, const float *data_offset, const float *data_mask, 74 | const int batch_size, const int channels, const int height_im, const int width_im, 75 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 76 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 77 | const int dilation_h, const int dilation_w, 78 | const int deformable_group, float *data_col); 79 | 80 | void modulated_deformable_col2im_cpu(const float *data_col, const float *data_offset, const float *data_mask, 81 | const int batch_size, const int channels, const int height_im, const int width_im, 82 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 83 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 84 | const int dilation_h, const int dilation_w, 85 | const int deformable_group, float *grad_im); 86 | 87 | void modulated_deformable_col2im_coord_cpu(const float *data_col, const float *data_im, const float *data_offset, const float *data_mask, 88 | const int batch_size, const int channels, const int height_im, const int width_im, 89 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 90 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 91 | const int dilation_h, const int dilation_w, 92 | const int deformable_group, 93 | float *grad_offset, float *grad_mask); 94 | 95 | #ifdef __cplusplus 96 | } 97 | #endif 98 | 99 | #endif -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * Copyright (c) 2017 Microsoft 3 | * Licensed under The MIT License [see LICENSE for details] 4 | * \file deformable_psroi_pooling.cu 5 | * \brief 6 | * \author Yi Li, Guodong Zhang, Jifeng Dai 7 | */ 8 | /***************** Adapted by Charles Shang *********************/ 9 | // modified from the CUDA version for CPU use by Daniel K.
Suhendro 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | #include 16 | //#include 17 | 18 | #include 19 | //#include 20 | //#include 21 | 22 | /*#define CUDA_KERNEL_LOOP(i, n) \ 23 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ 24 | i < (n); \ 25 | i += blockDim.x * gridDim.x) 26 | 27 | const int CUDA_NUM_THREADS = 1024; 28 | inline int GET_BLOCKS(const int N) 29 | { 30 | return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; 31 | }*/ 32 | 33 | template 34 | T bilinear_interp_cpu( 35 | const T *data, 36 | const T x, 37 | const T y, 38 | const int width, 39 | const int height) 40 | { 41 | int x1 = floor(x); 42 | int x2 = ceil(x); 43 | int y1 = floor(y); 44 | int y2 = ceil(y); 45 | T dist_x = static_cast(x - x1); 46 | T dist_y = static_cast(y - y1); 47 | T value11 = data[y1 * width + x1]; 48 | T value12 = data[y2 * width + x1]; 49 | T value21 = data[y1 * width + x2]; 50 | T value22 = data[y2 * width + x2]; 51 | T value = (1 - dist_x) * (1 - dist_y) * value11 + 52 | (1 - dist_x) * dist_y * value12 + 53 | dist_x * (1 - dist_y) * value21 + 54 | dist_x * dist_y * value22; 55 | return value; 56 | } 57 | 58 | template 59 | void DeformablePSROIPoolForwardKernelCpu( 60 | const int count, 61 | const T *bottom_data, 62 | const T spatial_scale, 63 | const int channels, 64 | const int height, const int width, 65 | const int pooled_height, const int pooled_width, 66 | const T *bottom_rois, const T *bottom_trans, 67 | const int no_trans, 68 | const T trans_std, 69 | const int sample_per_part, 70 | const int output_dim, 71 | const int group_size, 72 | const int part_size, 73 | const int num_classes, 74 | const int channels_each_class, 75 | T *top_data, 76 | T *top_count) 77 | { 78 | for(int index = 0; index < count; index++) 79 | { 80 | // The output is in order (n, ctop, ph, pw) 81 | int pw = index % pooled_width; 82 | int ph = (index / pooled_width) % pooled_height; 83 | int ctop = (index / pooled_width / pooled_height) % output_dim; 84 | int n = index / pooled_width / pooled_height / output_dim; 85 | 86 | // [start, end) interval for spatial sampling 87 | const T *offset_bottom_rois = bottom_rois + n * 5; 88 | int roi_batch_ind = offset_bottom_rois[0]; 89 | T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; 90 | T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; 91 | T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; 92 | T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; 93 | 94 | // Force too small ROIs to be 1x1 95 | T roi_width = std::max(roi_end_w - roi_start_w, T(0.1)); //avoid 0 96 | T roi_height = std::max(roi_end_h - roi_start_h, T(0.1)); 97 | 98 | // Compute w and h at bottom 99 | T bin_size_h = roi_height / static_cast(pooled_height); 100 | T bin_size_w = roi_width / static_cast(pooled_width); 101 | 102 | T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); 103 | T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); 104 | 105 | int part_h = floor(static_cast(ph) / pooled_height * part_size); 106 | int part_w = floor(static_cast(pw) / pooled_width * part_size); 107 | int class_id = ctop / channels_each_class; 108 | T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; 109 | T trans_y = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; 110 | 111 | T wstart = static_cast(pw) * bin_size_w + roi_start_w; 112 | wstart += trans_x * roi_width; 113 | T hstart = static_cast(ph) * bin_size_h + roi_start_h; 114 | hstart += trans_y * roi_height; 115 | 116 | T sum = 0; 117 | int count = 0; 118 | int gw = floor(static_cast(pw) * group_size / pooled_width); 119 | int gh = floor(static_cast(ph) * group_size / pooled_height); 120 | gw = std::min(std::max(gw, 0), group_size - 1); 121 | gh = std::min(std::max(gh, 0), group_size - 1); 122 | 123 | const T *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; 124 | for (int ih = 0; ih < sample_per_part; ih++) 125 | { 126 | for (int iw = 0; iw < sample_per_part; iw++) 127 | { 128 | T w = wstart + iw * sub_bin_size_w; 129 | T h = hstart + ih * sub_bin_size_h; 130 | // bilinear interpolation 131 | if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) 132 | { 133 | continue; 134 | } 135 | w = std::min(std::max(w, T(0.)), width - T(1.)); 136 | h = std::min(std::max(h, T(0.)), height - T(1.)); 137 | int c = (ctop * group_size + gh) * group_size + gw; 138 | T val = bilinear_interp_cpu(offset_bottom_data + c * height * width, w, h, width, height); 139 | sum += val; 140 | count++; 141 | } 142 | } 143 | top_data[index] = count == 0 ? static_cast(0) : sum / count; 144 | top_count[index] = count; 145 | } 146 | } 147 | 148 | template 149 | void DeformablePSROIPoolBackwardAccKernelCpu( 150 | const int count, 151 | const T *top_diff, 152 | const T *top_count, 153 | const int num_rois, 154 | const T spatial_scale, 155 | const int channels, 156 | const int height, const int width, 157 | const int pooled_height, const int pooled_width, 158 | const int output_dim, 159 | T *bottom_data_diff, T *bottom_trans_diff, 160 | const T *bottom_data, 161 | const T *bottom_rois, 162 | const T *bottom_trans, 163 | const int no_trans, 164 | const T trans_std, 165 | const int sample_per_part, 166 | const int group_size, 167 | const int part_size, 168 | const int num_classes, 169 | const int channels_each_class) 170 | { 171 | for(int index = 0; index < count; index++) 172 | { 173 | // The output is in order (n, ctop, ph, pw) 174 | int pw = index % pooled_width; 175 | int ph = (index / pooled_width) % pooled_height; 176 | int ctop = (index / pooled_width / pooled_height) % output_dim; 177 | int n = index / pooled_width / pooled_height / output_dim; 178 | 179 | // [start, end) interval for spatial sampling 180 | const T *offset_bottom_rois = bottom_rois + n * 5; 181 | int roi_batch_ind = offset_bottom_rois[0]; 182 | T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; 183 | T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; 184 | T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; 185 | T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) 
* spatial_scale - 0.5; 186 | 187 | // Force too small ROIs to be 1x1 188 | T roi_width = std::max(roi_end_w - roi_start_w, T(0.1)); //avoid 0 189 | T roi_height = std::max(roi_end_h - roi_start_h, T(0.1)); 190 | 191 | // Compute w and h at bottom 192 | T bin_size_h = roi_height / static_cast(pooled_height); 193 | T bin_size_w = roi_width / static_cast(pooled_width); 194 | 195 | T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); 196 | T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); 197 | 198 | int part_h = floor(static_cast(ph) / pooled_height * part_size); 199 | int part_w = floor(static_cast(pw) / pooled_width * part_size); 200 | int class_id = ctop / channels_each_class; 201 | T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; 202 | T trans_y = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; 203 | 204 | T wstart = static_cast(pw) * bin_size_w + roi_start_w; 205 | wstart += trans_x * roi_width; 206 | T hstart = static_cast(ph) * bin_size_h + roi_start_h; 207 | hstart += trans_y * roi_height; 208 | 209 | if (top_count[index] <= 0) 210 | { 211 | continue; 212 | } 213 | T diff_val = top_diff[index] / top_count[index]; 214 | const T *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; 215 | T *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; 216 | int gw = floor(static_cast(pw) * group_size / pooled_width); 217 | int gh = floor(static_cast(ph) * group_size / pooled_height); 218 | gw = std::min(std::max(gw, 0), group_size - 1); 219 | gh = std::min(std::max(gh, 0), group_size - 1); 220 | 221 | for (int ih = 0; ih < sample_per_part; ih++) 222 | { 223 | for (int iw = 0; iw < sample_per_part; iw++) 224 | { 225 | T w = wstart + iw * sub_bin_size_w; 226 | T h = hstart + ih * sub_bin_size_h; 227 | // bilinear interpolation 228 | if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) 229 | { 230 | continue; 231 | } 232 | w = std::min(std::max(w, T(0.)), width - T(1.)); 233 | h = std::min(std::max(h, T(0.)), height - T(1.)); 234 | int c = (ctop * group_size + gh) * group_size + gw; 235 | // backward on feature 236 | int x0 = floor(w); 237 | int x1 = ceil(w); 238 | int y0 = floor(h); 239 | int y1 = ceil(h); 240 | T dist_x = w - x0, dist_y = h - y0; 241 | T q00 = (1 - dist_x) * (1 - dist_y); 242 | T q01 = (1 - dist_x) * dist_y; 243 | T q10 = dist_x * (1 - dist_y); 244 | T q11 = dist_x * dist_y; 245 | int bottom_index_base = c * height * width; 246 | /*atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); 247 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); 248 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); 249 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val);*/ 250 | *(offset_bottom_data_diff + bottom_index_base + y0 * width + x0) += q00 * diff_val; 251 | *(offset_bottom_data_diff + bottom_index_base + y1 * width + x0) += q01 * diff_val; 252 | *(offset_bottom_data_diff + bottom_index_base + y0 * width + x1) += q10 * diff_val; 253 | *(offset_bottom_data_diff + bottom_index_base + y1 * width + x1) += q11 * diff_val; 254 | 255 | 256 | if (no_trans) 257 | { 258 | continue; 259 | } 260 | T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; 261 | T U01 = 
offset_bottom_data[bottom_index_base + y1 * width + x0]; 262 | T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; 263 | T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; 264 | T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; 265 | diff_x *= roi_width; 266 | T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val; 267 | diff_y *= roi_height; 268 | 269 | /*atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x); 270 | atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y);*/ 271 | *(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w) += diff_x; 272 | *(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w) += diff_y; 273 | } 274 | } 275 | } 276 | } 277 | 278 | std::tuple 279 | dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input, 280 | const at::Tensor &bbox, 281 | const at::Tensor &trans, 282 | const int no_trans, 283 | const float spatial_scale, 284 | const int output_dim, 285 | const int group_size, 286 | const int pooled_size, 287 | const int part_size, 288 | const int sample_per_part, 289 | const float trans_std) 290 | { 291 | /*AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 292 | AT_ASSERTM(bbox.type().is_cuda(), "rois must be a CUDA tensor"); 293 | AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor");*/ 294 | 295 | const int batch = input.size(0); 296 | const int channels = input.size(1); 297 | const int height = input.size(2); 298 | const int width = input.size(3); 299 | const int channels_trans = no_trans ? 2 : trans.size(1); 300 | const int num_bbox = bbox.size(0); 301 | 302 | AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); 303 | auto pooled_height = pooled_size; 304 | auto pooled_width = pooled_size; 305 | 306 | auto out = at::empty({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); 307 | long out_size = num_bbox * output_dim * pooled_height * pooled_width; 308 | auto top_count = at::zeros({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); 309 | 310 | const int num_classes = no_trans ? 1 : channels_trans / 2; 311 | const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; 312 | 313 | //cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 314 | 315 | if (out.numel() == 0) 316 | { 317 | //THCudaCheck(cudaGetLastError()); 318 | return std::make_tuple(out, top_count); 319 | } 320 | 321 | /*dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); 322 | dim3 block(512);*/ 323 | 324 | AT_DISPATCH_FLOATING_TYPES(input.type(), "dcn_v2_psroi_pooling_cpu_forward", [&] { 325 | DeformablePSROIPoolForwardKernelCpu( 326 | out_size, 327 | input.contiguous().data(), 328 | spatial_scale, 329 | channels, 330 | height, width, 331 | pooled_height, 332 | pooled_width, 333 | bbox.contiguous().data(), 334 | trans.contiguous().data(), 335 | no_trans, 336 | trans_std, 337 | sample_per_part, 338 | output_dim, 339 | group_size, 340 | part_size, 341 | num_classes, 342 | channels_each_class, 343 | out.data(), 344 | top_count.data()); 345 | }); 346 | //THCudaCheck(cudaGetLastError()); 347 | return std::make_tuple(out, top_count); 348 | } 349 | 350 | std::tuple 351 | dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad, 352 | const at::Tensor &input, 353 | const at::Tensor &bbox, 354 | const at::Tensor &trans, 355 | const at::Tensor &top_count, 356 | const int no_trans, 357 | const float spatial_scale, 358 | const int output_dim, 359 | const int group_size, 360 | const int pooled_size, 361 | const int part_size, 362 | const int sample_per_part, 363 | const float trans_std) 364 | { 365 | /*AT_ASSERTM(out_grad.type().is_cuda(), "out_grad must be a CUDA tensor"); 366 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 367 | AT_ASSERTM(bbox.type().is_cuda(), "bbox must be a CUDA tensor"); 368 | AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); 369 | AT_ASSERTM(top_count.type().is_cuda(), "top_count must be a CUDA tensor");*/ 370 | 371 | const int batch = input.size(0); 372 | const int channels = input.size(1); 373 | const int height = input.size(2); 374 | const int width = input.size(3); 375 | const int channels_trans = no_trans ? 2 : trans.size(1); 376 | const int num_bbox = bbox.size(0); 377 | 378 | AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); 379 | auto pooled_height = pooled_size; 380 | auto pooled_width = pooled_size; 381 | long out_size = num_bbox * output_dim * pooled_height * pooled_width; 382 | const int num_classes = no_trans ? 1 : channels_trans / 2; 383 | const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; 384 | 385 | auto input_grad = at::zeros({batch, channels, height, width}, out_grad.options()); 386 | auto trans_grad = at::zeros_like(trans); 387 | 388 | if (input_grad.numel() == 0) 389 | { 390 | //THCudaCheck(cudaGetLastError()); 391 | return std::make_tuple(input_grad, trans_grad); 392 | } 393 | 394 | /*dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); 395 | dim3 block(512); 396 | cudaStream_t stream = at::cuda::getCurrentCUDAStream();*/ 397 | 398 | AT_DISPATCH_FLOATING_TYPES(out_grad.type(), "dcn_v2_psroi_pooling_cpu_backward", [&] { 399 | DeformablePSROIPoolBackwardAccKernelCpu( 400 | out_size, 401 | out_grad.contiguous().data(), 402 | top_count.contiguous().data(), 403 | num_bbox, 404 | spatial_scale, 405 | channels, 406 | height, 407 | width, 408 | pooled_height, 409 | pooled_width, 410 | output_dim, 411 | input_grad.contiguous().data(), 412 | trans_grad.contiguous().data(), 413 | input.contiguous().data(), 414 | bbox.contiguous().data(), 415 | trans.contiguous().data(), 416 | no_trans, 417 | trans_std, 418 | sample_per_part, 419 | group_size, 420 | part_size, 421 | num_classes, 422 | channels_each_class); 423 | }); 424 | //THCudaCheck(cudaGetLastError()); 425 | return std::make_tuple(input_grad, trans_grad); 426 | } -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/src/cpu/vision.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | at::Tensor 5 | dcn_v2_cpu_forward(const at::Tensor &input, 6 | const at::Tensor &weight, 7 | const at::Tensor &bias, 8 | const at::Tensor &offset, 9 | const at::Tensor &mask, 10 | const int kernel_h, 11 | const int kernel_w, 12 | const int stride_h, 13 | const int stride_w, 14 | const int pad_h, 15 | const int pad_w, 16 | const int dilation_h, 17 | const int dilation_w, 18 | const int deformable_group); 19 | 20 | std::vector 21 | dcn_v2_cpu_backward(const at::Tensor &input, 22 | const at::Tensor &weight, 23 | const at::Tensor &bias, 24 | const at::Tensor &offset, 25 | const at::Tensor &mask, 26 | const at::Tensor &grad_output, 27 | int kernel_h, int kernel_w, 28 | int stride_h, int stride_w, 29 | int pad_h, int pad_w, 30 | int dilation_h, int dilation_w, 31 | int deformable_group); 32 | 33 | 34 | std::tuple 35 | dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input, 36 | const at::Tensor &bbox, 37 | const at::Tensor &trans, 38 | const int no_trans, 39 | const float spatial_scale, 40 | const int output_dim, 41 | const int group_size, 42 | const int pooled_size, 43 | const int part_size, 44 | const int sample_per_part, 45 | const float trans_std); 46 | 47 | std::tuple 48 | dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad, 49 | const at::Tensor &input, 50 | const at::Tensor &bbox, 51 | const at::Tensor &trans, 52 | const at::Tensor &top_count, 53 | const int no_trans, 54 | const float spatial_scale, 55 | const int output_dim, 56 | const int group_size, 57 | const int pooled_size, 58 | const int part_size, 59 | const int sample_per_part, 60 | const float trans_std); -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/src/cuda/dcn_v2_cuda.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "cuda/dcn_v2_im2col_cuda.h" 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | THCState *state = at::globalContext().lazyInitCUDA(); 12 | 13 | // author: 
Charles Shang 14 | // https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu 15 | 16 | // [batch gemm] 17 | // https://github.com/pytorch/pytorch/blob/master/aten/src/THC/generic/THCTensorMathBlas.cu 18 | 19 | __global__ void createBatchGemmBuffer(const float **input_b, float **output_b, 20 | float **columns_b, const float **ones_b, 21 | const float **weight_b, const float **bias_b, 22 | float *input, float *output, 23 | float *columns, float *ones, 24 | float *weight, float *bias, 25 | const int input_stride, const int output_stride, 26 | const int columns_stride, const int ones_stride, 27 | const int num_batches) 28 | { 29 | const int idx = blockIdx.x * blockDim.x + threadIdx.x; 30 | if (idx < num_batches) 31 | { 32 | input_b[idx] = input + idx * input_stride; 33 | output_b[idx] = output + idx * output_stride; 34 | columns_b[idx] = columns + idx * columns_stride; 35 | ones_b[idx] = ones + idx * ones_stride; 36 | // share weights and bias within a Mini-Batch 37 | weight_b[idx] = weight; 38 | bias_b[idx] = bias; 39 | } 40 | } 41 | 42 | at::Tensor 43 | dcn_v2_cuda_forward(const at::Tensor &input, 44 | const at::Tensor &weight, 45 | const at::Tensor &bias, 46 | const at::Tensor &offset, 47 | const at::Tensor &mask, 48 | const int kernel_h, 49 | const int kernel_w, 50 | const int stride_h, 51 | const int stride_w, 52 | const int pad_h, 53 | const int pad_w, 54 | const int dilation_h, 55 | const int dilation_w, 56 | const int deformable_group) 57 | { 58 | using scalar_t = float; 59 | // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask)); 60 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 61 | AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); 62 | AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); 63 | AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); 64 | AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); 65 | 66 | const int batch = input.size(0); 67 | const int channels = input.size(1); 68 | const int height = input.size(2); 69 | const int width = input.size(3); 70 | 71 | const int channels_out = weight.size(0); 72 | const int channels_kernel = weight.size(1); 73 | const int kernel_h_ = weight.size(2); 74 | const int kernel_w_ = weight.size(3); 75 | 76 | // printf("Kernels: %d %d %d %d\n", kernel_h_, kernel_w_, kernel_w, kernel_h); 77 | // printf("Channels: %d %d\n", channels, channels_kernel); 78 | // printf("Channels: %d %d\n", channels_out, channels_kernel); 79 | 80 | AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, 81 | "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); 82 | 83 | AT_ASSERTM(channels == channels_kernel, 84 | "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); 85 | 86 | const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; 87 | const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; 88 | 89 | auto ones = at::ones({batch, height_out, width_out}, input.options()); 90 | auto columns = at::empty({batch, channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); 91 | auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); 92 | 93 | // prepare for batch-wise computing, which is significantly faster than instance-wise computing 94 | // when batch size is large. 
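// Note on the forward pass below (a summary of what the two batched GEMMs compute):
// for each sample b in the mini-batch,
//
//     output[b] = weight * columns[b] + bias,
//
// where columns[b] is the (channels * kernel_h * kernel_w) x (height_out * width_out)
// matrix produced by modulated_deformable_im2col_cuda, weight is viewed as a
// (channels_out) x (channels * kernel_h * kernel_w) matrix, and the bias is broadcast
// over all spatial positions through the all-ones tensor (first SgemmBatched call,
// beta = 0) before the weight * columns product is accumulated on top of it
// (second SgemmBatched call, beta = 1). The per-sample device-pointer arrays
// allocated below (input_b, output_b, columns_b, ones_b, weight_b, bias_b) exist
// only so that a single batched cuBLAS call can cover the whole mini-batch instead
// of issuing one GEMM per sample.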
95 | // launch batch threads 96 | int matrices_size = batch * sizeof(float *); 97 | auto input_b = static_cast(THCudaMalloc(state, matrices_size)); 98 | auto output_b = static_cast(THCudaMalloc(state, matrices_size)); 99 | auto columns_b = static_cast(THCudaMalloc(state, matrices_size)); 100 | auto ones_b = static_cast(THCudaMalloc(state, matrices_size)); 101 | auto weight_b = static_cast(THCudaMalloc(state, matrices_size)); 102 | auto bias_b = static_cast(THCudaMalloc(state, matrices_size)); 103 | 104 | const int block = 128; 105 | const int grid = (batch + block - 1) / block; 106 | 107 | createBatchGemmBuffer<<>>( 108 | input_b, output_b, 109 | columns_b, ones_b, 110 | weight_b, bias_b, 111 | input.data(), 112 | output.data(), 113 | columns.data(), 114 | ones.data(), 115 | weight.data(), 116 | bias.data(), 117 | channels * width * height, 118 | channels_out * width_out * height_out, 119 | channels * kernel_h * kernel_w * height_out * width_out, 120 | height_out * width_out, 121 | batch); 122 | 123 | long m_ = channels_out; 124 | long n_ = height_out * width_out; 125 | long k_ = 1; 126 | THCudaBlas_SgemmBatched(state, 127 | 't', 128 | 'n', 129 | n_, 130 | m_, 131 | k_, 132 | 1.0f, 133 | ones_b, k_, 134 | bias_b, k_, 135 | 0.0f, 136 | output_b, n_, 137 | batch); 138 | 139 | modulated_deformable_im2col_cuda(c10::cuda::getCurrentCUDAStream(), 140 | input.data(), 141 | offset.data(), 142 | mask.data(), 143 | batch, channels, height, width, 144 | height_out, width_out, kernel_h, kernel_w, 145 | pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, 146 | deformable_group, 147 | columns.data()); 148 | 149 | long m = channels_out; 150 | long n = height_out * width_out; 151 | long k = channels * kernel_h * kernel_w; 152 | THCudaBlas_SgemmBatched(state, 153 | 'n', 154 | 'n', 155 | n, 156 | m, 157 | k, 158 | 1.0f, 159 | (const float **)columns_b, n, 160 | weight_b, k, 161 | 1.0f, 162 | output_b, n, 163 | batch); 164 | 165 | THCudaFree(state, input_b); 166 | THCudaFree(state, output_b); 167 | THCudaFree(state, columns_b); 168 | THCudaFree(state, ones_b); 169 | THCudaFree(state, weight_b); 170 | THCudaFree(state, bias_b); 171 | return output; 172 | } 173 | 174 | __global__ void createBatchGemmBufferBackward( 175 | float **grad_output_b, 176 | float **columns_b, 177 | float **ones_b, 178 | float **weight_b, 179 | float **grad_weight_b, 180 | float **grad_bias_b, 181 | float *grad_output, 182 | float *columns, 183 | float *ones, 184 | float *weight, 185 | float *grad_weight, 186 | float *grad_bias, 187 | const int grad_output_stride, 188 | const int columns_stride, 189 | const int ones_stride, 190 | const int num_batches) 191 | { 192 | const int idx = blockIdx.x * blockDim.x + threadIdx.x; 193 | if (idx < num_batches) 194 | { 195 | grad_output_b[idx] = grad_output + idx * grad_output_stride; 196 | columns_b[idx] = columns + idx * columns_stride; 197 | ones_b[idx] = ones + idx * ones_stride; 198 | 199 | // share weights and bias within a Mini-Batch 200 | weight_b[idx] = weight; 201 | grad_weight_b[idx] = grad_weight; 202 | grad_bias_b[idx] = grad_bias; 203 | } 204 | } 205 | 206 | std::vector dcn_v2_cuda_backward(const at::Tensor &input, 207 | const at::Tensor &weight, 208 | const at::Tensor &bias, 209 | const at::Tensor &offset, 210 | const at::Tensor &mask, 211 | const at::Tensor &grad_output, 212 | int kernel_h, int kernel_w, 213 | int stride_h, int stride_w, 214 | int pad_h, int pad_w, 215 | int dilation_h, int dilation_w, 216 | int deformable_group) 217 | { 218 | 219 | 
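// Summary of the backward pass below: unlike the forward pass, gradients are
// computed one sample at a time. For each sample b,
//   1. columns = weight^T * grad_output[b] maps the output gradient back to
//      im2col space;
//   2. modulated_deformable_col2im_coord_cuda scatters it into grad_offset[b] and
//      grad_mask[b], and modulated_deformable_col2im_cuda scatters it into
//      grad_input[b];
//   3. the im2col columns are rebuilt from input[b] so that
//      grad_weight += grad_output[b] * columns^T and
//      grad_bias += grad_output[b] * ones accumulate across the batch.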
THArgCheck(input.is_contiguous(), 1, "input tensor has to be contiguous"); 220 | THArgCheck(weight.is_contiguous(), 2, "weight tensor has to be contiguous"); 221 | 222 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 223 | AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); 224 | AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); 225 | AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); 226 | AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); 227 | 228 | const int batch = input.size(0); 229 | const int channels = input.size(1); 230 | const int height = input.size(2); 231 | const int width = input.size(3); 232 | 233 | const int channels_out = weight.size(0); 234 | const int channels_kernel = weight.size(1); 235 | const int kernel_h_ = weight.size(2); 236 | const int kernel_w_ = weight.size(3); 237 | 238 | AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, 239 | "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); 240 | 241 | AT_ASSERTM(channels == channels_kernel, 242 | "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); 243 | 244 | const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; 245 | const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; 246 | 247 | auto ones = at::ones({height_out, width_out}, input.options()); 248 | auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); 249 | auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); 250 | 251 | auto grad_input = at::zeros_like(input); 252 | auto grad_weight = at::zeros_like(weight); 253 | auto grad_bias = at::zeros_like(bias); 254 | auto grad_offset = at::zeros_like(offset); 255 | auto grad_mask = at::zeros_like(mask); 256 | 257 | using scalar_t = float; 258 | 259 | for (int b = 0; b < batch; b++) 260 | { 261 | auto input_n = input.select(0, b); 262 | auto offset_n = offset.select(0, b); 263 | auto mask_n = mask.select(0, b); 264 | auto grad_output_n = grad_output.select(0, b); 265 | auto grad_input_n = grad_input.select(0, b); 266 | auto grad_offset_n = grad_offset.select(0, b); 267 | auto grad_mask_n = grad_mask.select(0, b); 268 | 269 | long m = channels * kernel_h * kernel_w; 270 | long n = height_out * width_out; 271 | long k = channels_out; 272 | 273 | THCudaBlas_Sgemm(state, 'n', 't', n, m, k, 1.0f, 274 | grad_output_n.data(), n, 275 | weight.data(), m, 0.0f, 276 | columns.data(), n); 277 | 278 | // gradient w.r.t. input coordinate data 279 | modulated_deformable_col2im_coord_cuda(c10::cuda::getCurrentCUDAStream(), 280 | columns.data(), 281 | input_n.data(), 282 | offset_n.data(), 283 | mask_n.data(), 284 | 1, channels, height, width, 285 | height_out, width_out, kernel_h, kernel_w, 286 | pad_h, pad_w, stride_h, stride_w, 287 | dilation_h, dilation_w, deformable_group, 288 | grad_offset_n.data(), 289 | grad_mask_n.data()); 290 | // gradient w.r.t. input data 291 | modulated_deformable_col2im_cuda(c10::cuda::getCurrentCUDAStream(), 292 | columns.data(), 293 | offset_n.data(), 294 | mask_n.data(), 295 | 1, channels, height, width, 296 | height_out, width_out, kernel_h, kernel_w, 297 | pad_h, pad_w, stride_h, stride_w, 298 | dilation_h, dilation_w, deformable_group, 299 | grad_input_n.data()); 300 | 301 | // gradient w.r.t. 
weight, dWeight should accumulate across the batch and group 302 | modulated_deformable_im2col_cuda(c10::cuda::getCurrentCUDAStream(), 303 | input_n.data(), 304 | offset_n.data(), 305 | mask_n.data(), 306 | 1, channels, height, width, 307 | height_out, width_out, kernel_h, kernel_w, 308 | pad_h, pad_w, stride_h, stride_w, 309 | dilation_h, dilation_w, deformable_group, 310 | columns.data()); 311 | 312 | long m_ = channels_out; 313 | long n_ = channels * kernel_h * kernel_w; 314 | long k_ = height_out * width_out; 315 | 316 | THCudaBlas_Sgemm(state, 't', 'n', n_, m_, k_, 1.0f, 317 | columns.data(), k_, 318 | grad_output_n.data(), k_, 1.0f, 319 | grad_weight.data(), n_); 320 | 321 | // gradient w.r.t. bias 322 | // long m_ = channels_out; 323 | // long k__ = height_out * width_out; 324 | THCudaBlas_Sgemv(state, 325 | 't', 326 | k_, m_, 1.0f, 327 | grad_output_n.data(), k_, 328 | ones.data(), 1, 1.0f, 329 | grad_bias.data(), 1); 330 | } 331 | 332 | return { 333 | grad_input, grad_offset, grad_mask, grad_weight, grad_bias 334 | }; 335 | } -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/src/cuda/dcn_v2_im2col_cuda.cu: -------------------------------------------------------------------------------- 1 | #include "dcn_v2_im2col_cuda.h" 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | #define CUDA_KERNEL_LOOP(i, n) \ 14 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ 15 | i < (n); \ 16 | i += blockDim.x * gridDim.x) 17 | 18 | const int CUDA_NUM_THREADS = 1024; 19 | inline int GET_BLOCKS(const int N) 20 | { 21 | return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; 22 | } 23 | 24 | 25 | __device__ float dmcn_im2col_bilinear_cuda(const float *bottom_data, const int data_width, 26 | const int height, const int width, float h, float w) 27 | { 28 | int h_low = floor(h); 29 | int w_low = floor(w); 30 | int h_high = h_low + 1; 31 | int w_high = w_low + 1; 32 | 33 | float lh = h - h_low; 34 | float lw = w - w_low; 35 | float hh = 1 - lh, hw = 1 - lw; 36 | 37 | float v1 = 0; 38 | if (h_low >= 0 && w_low >= 0) 39 | v1 = bottom_data[h_low * data_width + w_low]; 40 | float v2 = 0; 41 | if (h_low >= 0 && w_high <= width - 1) 42 | v2 = bottom_data[h_low * data_width + w_high]; 43 | float v3 = 0; 44 | if (h_high <= height - 1 && w_low >= 0) 45 | v3 = bottom_data[h_high * data_width + w_low]; 46 | float v4 = 0; 47 | if (h_high <= height - 1 && w_high <= width - 1) 48 | v4 = bottom_data[h_high * data_width + w_high]; 49 | 50 | float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; 51 | 52 | float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); 53 | return val; 54 | } 55 | 56 | __device__ float dmcn_get_gradient_weight_cuda(float argmax_h, float argmax_w, 57 | const int h, const int w, const int height, const int width) 58 | { 59 | if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) 60 | { 61 | //empty 62 | return 0; 63 | } 64 | 65 | int argmax_h_low = floor(argmax_h); 66 | int argmax_w_low = floor(argmax_w); 67 | int argmax_h_high = argmax_h_low + 1; 68 | int argmax_w_high = argmax_w_low + 1; 69 | 70 | float weight = 0; 71 | if (h == argmax_h_low && w == argmax_w_low) 72 | weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); 73 | if (h == argmax_h_low && w == argmax_w_high) 74 | weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); 75 | if (h == argmax_h_high && w == argmax_w_low) 76 | weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); 77 | if (h == 
argmax_h_high && w == argmax_w_high) 78 | weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); 79 | return weight; 80 | } 81 | 82 | __device__ float dmcn_get_coordinate_weight_cuda(float argmax_h, float argmax_w, 83 | const int height, const int width, const float *im_data, 84 | const int data_width, const int bp_dir) 85 | { 86 | if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) 87 | { 88 | //empty 89 | return 0; 90 | } 91 | 92 | int argmax_h_low = floor(argmax_h); 93 | int argmax_w_low = floor(argmax_w); 94 | int argmax_h_high = argmax_h_low + 1; 95 | int argmax_w_high = argmax_w_low + 1; 96 | 97 | float weight = 0; 98 | 99 | if (bp_dir == 0) 100 | { 101 | if (argmax_h_low >= 0 && argmax_w_low >= 0) 102 | weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; 103 | if (argmax_h_low >= 0 && argmax_w_high <= width - 1) 104 | weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; 105 | if (argmax_h_high <= height - 1 && argmax_w_low >= 0) 106 | weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; 107 | if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) 108 | weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; 109 | } 110 | else if (bp_dir == 1) 111 | { 112 | if (argmax_h_low >= 0 && argmax_w_low >= 0) 113 | weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; 114 | if (argmax_h_low >= 0 && argmax_w_high <= width - 1) 115 | weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; 116 | if (argmax_h_high <= height - 1 && argmax_w_low >= 0) 117 | weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; 118 | if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) 119 | weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; 120 | } 121 | 122 | return weight; 123 | } 124 | 125 | __global__ void modulated_deformable_im2col_gpu_kernel(const int n, 126 | const float *data_im, const float *data_offset, const float *data_mask, 127 | const int height, const int width, const int kernel_h, const int kernel_w, 128 | const int pad_h, const int pad_w, 129 | const int stride_h, const int stride_w, 130 | const int dilation_h, const int dilation_w, 131 | const int channel_per_deformable_group, 132 | const int batch_size, const int num_channels, const int deformable_group, 133 | const int height_col, const int width_col, 134 | float *data_col) 135 | { 136 | // launch channels * batch_size * height_col * width_col cores 137 | CUDA_KERNEL_LOOP(index, n) 138 | { 139 | // NOTE(CharlesShang): different from Dai Jifeng's MXNet implementation, col_buffer is of shape (c*kw*kh, N, oh, ow) 140 | // here columns is of shape (N, c*kw*kh, oh * ow), need to adapt axis 141 | 142 | // index index of output matrix 143 | const int w_col = index % width_col; 144 | const int h_col = (index / width_col) % height_col; 145 | // const int b_col = (index / width_col / height_col) % batch_size; 146 | const int b_col = (index / width_col / height_col / num_channels) % batch_size; 147 | // const int c_im = (index / width_col / height_col) / batch_size; 148 | const int c_im = (index / width_col / height_col) % num_channels; 149 | // const int c_col = c_im * kernel_h * kernel_w; 150 | const int c_col = c_im * kernel_h * kernel_w; 151 | 152 | // compute deformable group index 153 | 
const int deformable_group_index = c_im / channel_per_deformable_group; 154 | 155 | const int h_in = h_col * stride_h - pad_h; 156 | const int w_in = w_col * stride_w - pad_w; 157 | 158 | // float *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; 159 | float *data_col_ptr = data_col + ((b_col * num_channels * kernel_w * kernel_h + c_col) * height_col + h_col) * width_col + w_col; 160 | //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; 161 | const float *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; 162 | const float *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; 163 | 164 | const float *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; 165 | 166 | for (int i = 0; i < kernel_h; ++i) 167 | { 168 | for (int j = 0; j < kernel_w; ++j) 169 | { 170 | const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; 171 | const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; 172 | const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; 173 | const float offset_h = data_offset_ptr[data_offset_h_ptr]; 174 | const float offset_w = data_offset_ptr[data_offset_w_ptr]; 175 | const float mask = data_mask_ptr[data_mask_hw_ptr]; 176 | float val = static_cast(0); 177 | const float h_im = h_in + i * dilation_h + offset_h; 178 | const float w_im = w_in + j * dilation_w + offset_w; 179 | //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { 180 | if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) 181 | { 182 | //const float map_h = i * dilation_h + offset_h; 183 | //const float map_w = j * dilation_w + offset_w; 184 | //const int cur_height = height - h_in; 185 | //const int cur_width = width - w_in; 186 | //val = dmcn_im2col_bilinear_cuda(data_im_ptr, width, cur_height, cur_width, map_h, map_w); 187 | val = dmcn_im2col_bilinear_cuda(data_im_ptr, width, height, width, h_im, w_im); 188 | } 189 | *data_col_ptr = val * mask; 190 | // data_col_ptr += batch_size * height_col * width_col; 191 | data_col_ptr += height_col * width_col; 192 | } 193 | } 194 | } 195 | } 196 | 197 | __global__ void modulated_deformable_col2im_gpu_kernel(const int n, 198 | const float *data_col, const float *data_offset, const float *data_mask, 199 | const int channels, const int height, const int width, 200 | const int kernel_h, const int kernel_w, 201 | const int pad_h, const int pad_w, 202 | const int stride_h, const int stride_w, 203 | const int dilation_h, const int dilation_w, 204 | const int channel_per_deformable_group, 205 | const int batch_size, const int deformable_group, 206 | const int height_col, const int width_col, 207 | float *grad_im) 208 | { 209 | CUDA_KERNEL_LOOP(index, n) 210 | { 211 | const int j = (index / width_col / height_col / batch_size) % kernel_w; 212 | const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; 213 | const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; 214 | // compute the start and end of the output 215 | 216 | const int deformable_group_index = c / channel_per_deformable_group; 217 | 218 | int w_out = index % width_col; 219 | int h_out = (index / width_col) % height_col; 220 | int b = (index / width_col / 
height_col) % batch_size; 221 | int w_in = w_out * stride_w - pad_w; 222 | int h_in = h_out * stride_h - pad_h; 223 | 224 | const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; 225 | const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; 226 | const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; 227 | const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; 228 | const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; 229 | const float offset_h = data_offset_ptr[data_offset_h_ptr]; 230 | const float offset_w = data_offset_ptr[data_offset_w_ptr]; 231 | const float mask = data_mask_ptr[data_mask_hw_ptr]; 232 | const float cur_inv_h_data = h_in + i * dilation_h + offset_h; 233 | const float cur_inv_w_data = w_in + j * dilation_w + offset_w; 234 | 235 | const float cur_top_grad = data_col[index] * mask; 236 | const int cur_h = (int)cur_inv_h_data; 237 | const int cur_w = (int)cur_inv_w_data; 238 | for (int dy = -2; dy <= 2; dy++) 239 | { 240 | for (int dx = -2; dx <= 2; dx++) 241 | { 242 | if (cur_h + dy >= 0 && cur_h + dy < height && 243 | cur_w + dx >= 0 && cur_w + dx < width && 244 | abs(cur_inv_h_data - (cur_h + dy)) < 1 && 245 | abs(cur_inv_w_data - (cur_w + dx)) < 1) 246 | { 247 | int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; 248 | float weight = dmcn_get_gradient_weight_cuda(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); 249 | atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); 250 | } 251 | } 252 | } 253 | } 254 | } 255 | 256 | __global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n, 257 | const float *data_col, const float *data_im, 258 | const float *data_offset, const float *data_mask, 259 | const int channels, const int height, const int width, 260 | const int kernel_h, const int kernel_w, 261 | const int pad_h, const int pad_w, 262 | const int stride_h, const int stride_w, 263 | const int dilation_h, const int dilation_w, 264 | const int channel_per_deformable_group, 265 | const int batch_size, const int offset_channels, const int deformable_group, 266 | const int height_col, const int width_col, 267 | float *grad_offset, float *grad_mask) 268 | { 269 | CUDA_KERNEL_LOOP(index, n) 270 | { 271 | float val = 0, mval = 0; 272 | int w = index % width_col; 273 | int h = (index / width_col) % height_col; 274 | int c = (index / width_col / height_col) % offset_channels; 275 | int b = (index / width_col / height_col) / offset_channels; 276 | // compute the start and end of the output 277 | 278 | const int deformable_group_index = c / (2 * kernel_h * kernel_w); 279 | const int col_step = kernel_h * kernel_w; 280 | int cnt = 0; 281 | const float *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; 282 | const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; 283 | const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; 284 | const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col 
* width_col; 285 | 286 | const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; 287 | 288 | for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) 289 | { 290 | const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; 291 | const int bp_dir = offset_c % 2; 292 | 293 | int j = (col_pos / width_col / height_col / batch_size) % kernel_w; 294 | int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; 295 | int w_out = col_pos % width_col; 296 | int h_out = (col_pos / width_col) % height_col; 297 | int w_in = w_out * stride_w - pad_w; 298 | int h_in = h_out * stride_h - pad_h; 299 | const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); 300 | const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); 301 | const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); 302 | const float offset_h = data_offset_ptr[data_offset_h_ptr]; 303 | const float offset_w = data_offset_ptr[data_offset_w_ptr]; 304 | const float mask = data_mask_ptr[data_mask_hw_ptr]; 305 | float inv_h = h_in + i * dilation_h + offset_h; 306 | float inv_w = w_in + j * dilation_w + offset_w; 307 | if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) 308 | { 309 | inv_h = inv_w = -2; 310 | } 311 | else 312 | { 313 | mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear_cuda(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); 314 | } 315 | const float weight = dmcn_get_coordinate_weight_cuda( 316 | inv_h, inv_w, 317 | height, width, data_im_ptr + cnt * height * width, width, bp_dir); 318 | val += weight * data_col_ptr[col_pos] * mask; 319 | cnt += 1; 320 | } 321 | // KERNEL_ASSIGN(grad_offset[index], offset_req, val); 322 | grad_offset[index] = val; 323 | if (offset_c % 2 == 0) 324 | // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval); 325 | grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; 326 | } 327 | } 328 | 329 | void modulated_deformable_im2col_cuda(cudaStream_t stream, 330 | const float* data_im, const float* data_offset, const float* data_mask, 331 | const int batch_size, const int channels, const int height_im, const int width_im, 332 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 333 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 334 | const int dilation_h, const int dilation_w, 335 | const int deformable_group, float* data_col) { 336 | // num_axes should be smaller than block size 337 | const int channel_per_deformable_group = channels / deformable_group; 338 | const int num_kernels = channels * batch_size * height_col * width_col; 339 | modulated_deformable_im2col_gpu_kernel 340 | <<>>( 342 | num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w, 343 | pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, 344 | batch_size, channels, deformable_group, height_col, width_col, data_col); 345 | 346 | cudaError_t err = cudaGetLastError(); 347 | if (err != cudaSuccess) 348 | { 349 | printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); 350 | } 351 | 352 | } 353 | 354 | void 
modulated_deformable_col2im_cuda(cudaStream_t stream, 355 | const float* data_col, const float* data_offset, const float* data_mask, 356 | const int batch_size, const int channels, const int height_im, const int width_im, 357 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 358 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 359 | const int dilation_h, const int dilation_w, 360 | const int deformable_group, float* grad_im){ 361 | 362 | const int channel_per_deformable_group = channels / deformable_group; 363 | const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; 364 | modulated_deformable_col2im_gpu_kernel 365 | <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 366 | 0, stream>>>( 367 | num_kernels, data_col, data_offset, data_mask, channels, height_im, width_im, 368 | kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, 369 | dilation_h, dilation_w, channel_per_deformable_group, 370 | batch_size, deformable_group, height_col, width_col, grad_im); 371 | cudaError_t err = cudaGetLastError(); 372 | if (err != cudaSuccess) 373 | { 374 | printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); 375 | } 376 | 377 | } 378 | 379 | void modulated_deformable_col2im_coord_cuda(cudaStream_t stream, 380 | const float* data_col, const float* data_im, const float* data_offset, const float* data_mask, 381 | const int batch_size, const int channels, const int height_im, const int width_im, 382 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 383 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 384 | const int dilation_h, const int dilation_w, 385 | const int deformable_group, 386 | float* grad_offset, float* grad_mask) { 387 | const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; 388 | const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; 389 | modulated_deformable_col2im_coord_gpu_kernel 390 | <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 391 | 0, stream>>>( 392 | num_kernels, data_col, data_im, data_offset, data_mask, channels, height_im, width_im, 393 | kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, 394 | dilation_h, dilation_w, channel_per_deformable_group, 395 | batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, 396 | grad_offset, grad_mask); 397 | cudaError_t err = cudaGetLastError(); 398 | if (err != cudaSuccess) 399 | { 400 | printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); 401 | } 402 | } -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/src/cuda/dcn_v2_im2col_cuda.h: -------------------------------------------------------------------------------- 1 | 2 | /*! 3 | ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** 4 | * 5 | * COPYRIGHT 6 | * 7 | * All contributions by the University of California: 8 | * Copyright (c) 2014-2017 The Regents of the University of California (Regents) 9 | * All rights reserved. 10 | * 11 | * All other contributions: 12 | * Copyright (c) 2014-2017, the respective contributors 13 | * All rights reserved. 14 | * 15 | * Caffe uses a shared copyright model: each contributor holds copyright over 16 | * their contributions to Caffe. The project versioning records all such 17 | * contribution and copyright details.
If a contributor wants to further mark 18 | * their specific copyright on a particular contribution, they should indicate 19 | * their copyright solely in the commit message of the change when it is 20 | * committed. 21 | * 22 | * LICENSE 23 | * 24 | * Redistribution and use in source and binary forms, with or without 25 | * modification, are permitted provided that the following conditions are met: 26 | * 27 | * 1. Redistributions of source code must retain the above copyright notice, this 28 | * list of conditions and the following disclaimer. 29 | * 2. Redistributions in binary form must reproduce the above copyright notice, 30 | * this list of conditions and the following disclaimer in the documentation 31 | * and/or other materials provided with the distribution. 32 | * 33 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 34 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 35 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 36 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 37 | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 38 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 39 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 40 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 41 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 42 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 43 | * 44 | * CONTRIBUTION AGREEMENT 45 | * 46 | * By contributing to the BVLC/caffe repository through pull-request, comment, 47 | * or otherwise, the contributor releases their content to the 48 | * license and copyright terms herein. 49 | * 50 | ***************** END Caffe Copyright Notice and Disclaimer ******************** 51 | * 52 | * Copyright (c) 2018 Microsoft 53 | * Licensed under The MIT License [see LICENSE for details] 54 | * \file modulated_deformable_im2col.h 55 | * \brief Function definitions of converting an image to 56 | * column matrix based on kernel, padding, dilation, and offset. 57 | * These functions are mainly used in deformable convolution operators. 
58 | * \ref: https://arxiv.org/abs/1811.11168 59 | * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu 60 | */ 61 | 62 | /***************** Adapted by Charles Shang *********************/ 63 | 64 | #ifndef DCN_V2_IM2COL_CUDA 65 | #define DCN_V2_IM2COL_CUDA 66 | 67 | #ifdef __cplusplus 68 | extern "C" 69 | { 70 | #endif 71 | 72 | void modulated_deformable_im2col_cuda(cudaStream_t stream, 73 | const float *data_im, const float *data_offset, const float *data_mask, 74 | const int batch_size, const int channels, const int height_im, const int width_im, 75 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 76 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 77 | const int dilation_h, const int dilation_w, 78 | const int deformable_group, float *data_col); 79 | 80 | void modulated_deformable_col2im_cuda(cudaStream_t stream, 81 | const float *data_col, const float *data_offset, const float *data_mask, 82 | const int batch_size, const int channels, const int height_im, const int width_im, 83 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 84 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 85 | const int dilation_h, const int dilation_w, 86 | const int deformable_group, float *grad_im); 87 | 88 | void modulated_deformable_col2im_coord_cuda(cudaStream_t stream, 89 | const float *data_col, const float *data_im, const float *data_offset, const float *data_mask, 90 | const int batch_size, const int channels, const int height_im, const int width_im, 91 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 92 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 93 | const int dilation_h, const int dilation_w, 94 | const int deformable_group, 95 | float *grad_offset, float *grad_mask); 96 | 97 | #ifdef __cplusplus 98 | } 99 | #endif 100 | 101 | #endif -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.cu: -------------------------------------------------------------------------------- 1 | /*!
2 | * Copyright (c) 2017 Microsoft 3 | * Licensed under The MIT License [see LICENSE for details] 4 | * \file deformable_psroi_pooling.cu 5 | * \brief 6 | * \author Yi Li, Guodong Zhang, Jifeng Dai 7 | */ 8 | /***************** Adapted by Charles Shang *********************/ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | 18 | #include 19 | #include 20 | #include 21 | 22 | #define CUDA_KERNEL_LOOP(i, n) \ 23 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ 24 | i < (n); \ 25 | i += blockDim.x * gridDim.x) 26 | 27 | const int CUDA_NUM_THREADS = 1024; 28 | inline int GET_BLOCKS(const int N) 29 | { 30 | return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; 31 | } 32 | 33 | template 34 | __device__ T bilinear_interp_cuda( 35 | const T *data, 36 | const T x, 37 | const T y, 38 | const int width, 39 | const int height) 40 | { 41 | int x1 = floor(x); 42 | int x2 = ceil(x); 43 | int y1 = floor(y); 44 | int y2 = ceil(y); 45 | T dist_x = static_cast(x - x1); 46 | T dist_y = static_cast(y - y1); 47 | T value11 = data[y1 * width + x1]; 48 | T value12 = data[y2 * width + x1]; 49 | T value21 = data[y1 * width + x2]; 50 | T value22 = data[y2 * width + x2]; 51 | T value = (1 - dist_x) * (1 - dist_y) * value11 + 52 | (1 - dist_x) * dist_y * value12 + 53 | dist_x * (1 - dist_y) * value21 + 54 | dist_x * dist_y * value22; 55 | return value; 56 | } 57 | 58 | template 59 | __global__ void DeformablePSROIPoolForwardKernelCuda( 60 | const int count, 61 | const T *bottom_data, 62 | const T spatial_scale, 63 | const int channels, 64 | const int height, const int width, 65 | const int pooled_height, const int pooled_width, 66 | const T *bottom_rois, const T *bottom_trans, 67 | const int no_trans, 68 | const T trans_std, 69 | const int sample_per_part, 70 | const int output_dim, 71 | const int group_size, 72 | const int part_size, 73 | const int num_classes, 74 | const int channels_each_class, 75 | T *top_data, 76 | T *top_count) 77 | { 78 | CUDA_KERNEL_LOOP(index, count) 79 | { 80 | // The output is in order (n, ctop, ph, pw) 81 | int pw = index % pooled_width; 82 | int ph = (index / pooled_width) % pooled_height; 83 | int ctop = (index / pooled_width / pooled_height) % output_dim; 84 | int n = index / pooled_width / pooled_height / output_dim; 85 | 86 | // [start, end) interval for spatial sampling 87 | const T *offset_bottom_rois = bottom_rois + n * 5; 88 | int roi_batch_ind = offset_bottom_rois[0]; 89 | T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; 90 | T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; 91 | T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; 92 | T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; 93 | 94 | // Force too small ROIs to be 1x1 95 | T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 96 | T roi_height = max(roi_end_h - roi_start_h, 0.1); 97 | 98 | // Compute w and h at bottom 99 | T bin_size_h = roi_height / static_cast(pooled_height); 100 | T bin_size_w = roi_width / static_cast(pooled_width); 101 | 102 | T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); 103 | T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); 104 | 105 | int part_h = floor(static_cast(ph) / pooled_height * part_size); 106 | int part_w = floor(static_cast(pw) / pooled_width * part_size); 107 | int class_id = ctop / channels_each_class; 108 | T trans_x = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; 109 | T trans_y = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; 110 | 111 | T wstart = static_cast(pw) * bin_size_w + roi_start_w; 112 | wstart += trans_x * roi_width; 113 | T hstart = static_cast(ph) * bin_size_h + roi_start_h; 114 | hstart += trans_y * roi_height; 115 | 116 | T sum = 0; 117 | int count = 0; 118 | int gw = floor(static_cast(pw) * group_size / pooled_width); 119 | int gh = floor(static_cast(ph) * group_size / pooled_height); 120 | gw = min(max(gw, 0), group_size - 1); 121 | gh = min(max(gh, 0), group_size - 1); 122 | 123 | const T *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; 124 | for (int ih = 0; ih < sample_per_part; ih++) 125 | { 126 | for (int iw = 0; iw < sample_per_part; iw++) 127 | { 128 | T w = wstart + iw * sub_bin_size_w; 129 | T h = hstart + ih * sub_bin_size_h; 130 | // bilinear interpolation 131 | if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) 132 | { 133 | continue; 134 | } 135 | w = min(max(w, 0.), width - 1.); 136 | h = min(max(h, 0.), height - 1.); 137 | int c = (ctop * group_size + gh) * group_size + gw; 138 | T val = bilinear_interp_cuda(offset_bottom_data + c * height * width, w, h, width, height); 139 | sum += val; 140 | count++; 141 | } 142 | } 143 | top_data[index] = count == 0 ? static_cast(0) : sum / count; 144 | top_count[index] = count; 145 | } 146 | } 147 | 148 | template 149 | __global__ void DeformablePSROIPoolBackwardAccKernelCuda( 150 | const int count, 151 | const T *top_diff, 152 | const T *top_count, 153 | const int num_rois, 154 | const T spatial_scale, 155 | const int channels, 156 | const int height, const int width, 157 | const int pooled_height, const int pooled_width, 158 | const int output_dim, 159 | T *bottom_data_diff, T *bottom_trans_diff, 160 | const T *bottom_data, 161 | const T *bottom_rois, 162 | const T *bottom_trans, 163 | const int no_trans, 164 | const T trans_std, 165 | const int sample_per_part, 166 | const int group_size, 167 | const int part_size, 168 | const int num_classes, 169 | const int channels_each_class) 170 | { 171 | CUDA_KERNEL_LOOP(index, count) 172 | { 173 | // The output is in order (n, ctop, ph, pw) 174 | int pw = index % pooled_width; 175 | int ph = (index / pooled_width) % pooled_height; 176 | int ctop = (index / pooled_width / pooled_height) % output_dim; 177 | int n = index / pooled_width / pooled_height / output_dim; 178 | 179 | // [start, end) interval for spatial sampling 180 | const T *offset_bottom_rois = bottom_rois + n * 5; 181 | int roi_batch_ind = offset_bottom_rois[0]; 182 | T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; 183 | T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; 184 | T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; 185 | T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) 
* spatial_scale - 0.5; 186 | 187 | // Force too small ROIs to be 1x1 188 | T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 189 | T roi_height = max(roi_end_h - roi_start_h, 0.1); 190 | 191 | // Compute w and h at bottom 192 | T bin_size_h = roi_height / static_cast(pooled_height); 193 | T bin_size_w = roi_width / static_cast(pooled_width); 194 | 195 | T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); 196 | T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); 197 | 198 | int part_h = floor(static_cast(ph) / pooled_height * part_size); 199 | int part_w = floor(static_cast(pw) / pooled_width * part_size); 200 | int class_id = ctop / channels_each_class; 201 | T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; 202 | T trans_y = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; 203 | 204 | T wstart = static_cast(pw) * bin_size_w + roi_start_w; 205 | wstart += trans_x * roi_width; 206 | T hstart = static_cast(ph) * bin_size_h + roi_start_h; 207 | hstart += trans_y * roi_height; 208 | 209 | if (top_count[index] <= 0) 210 | { 211 | continue; 212 | } 213 | T diff_val = top_diff[index] / top_count[index]; 214 | const T *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; 215 | T *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; 216 | int gw = floor(static_cast(pw) * group_size / pooled_width); 217 | int gh = floor(static_cast(ph) * group_size / pooled_height); 218 | gw = min(max(gw, 0), group_size - 1); 219 | gh = min(max(gh, 0), group_size - 1); 220 | 221 | for (int ih = 0; ih < sample_per_part; ih++) 222 | { 223 | for (int iw = 0; iw < sample_per_part; iw++) 224 | { 225 | T w = wstart + iw * sub_bin_size_w; 226 | T h = hstart + ih * sub_bin_size_h; 227 | // bilinear interpolation 228 | if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) 229 | { 230 | continue; 231 | } 232 | w = min(max(w, 0.), width - 1.); 233 | h = min(max(h, 0.), height - 1.); 234 | int c = (ctop * group_size + gh) * group_size + gw; 235 | // backward on feature 236 | int x0 = floor(w); 237 | int x1 = ceil(w); 238 | int y0 = floor(h); 239 | int y1 = ceil(h); 240 | T dist_x = w - x0, dist_y = h - y0; 241 | T q00 = (1 - dist_x) * (1 - dist_y); 242 | T q01 = (1 - dist_x) * dist_y; 243 | T q10 = dist_x * (1 - dist_y); 244 | T q11 = dist_x * dist_y; 245 | int bottom_index_base = c * height * width; 246 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); 247 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); 248 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); 249 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val); 250 | 251 | if (no_trans) 252 | { 253 | continue; 254 | } 255 | T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; 256 | T U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; 257 | T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; 258 | T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; 259 | T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; 260 | diff_x *= roi_width; 261 | T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std 
* diff_val; 262 | diff_y *= roi_height; 263 | 264 | atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x); 265 | atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y); 266 | } 267 | } 268 | } 269 | } 270 | 271 | std::tuple 272 | dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input, 273 | const at::Tensor &bbox, 274 | const at::Tensor &trans, 275 | const int no_trans, 276 | const float spatial_scale, 277 | const int output_dim, 278 | const int group_size, 279 | const int pooled_size, 280 | const int part_size, 281 | const int sample_per_part, 282 | const float trans_std) 283 | { 284 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 285 | AT_ASSERTM(bbox.type().is_cuda(), "rois must be a CUDA tensor"); 286 | AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); 287 | 288 | const int batch = input.size(0); 289 | const int channels = input.size(1); 290 | const int height = input.size(2); 291 | const int width = input.size(3); 292 | const int channels_trans = no_trans ? 2 : trans.size(1); 293 | const int num_bbox = bbox.size(0); 294 | 295 | AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); 296 | auto pooled_height = pooled_size; 297 | auto pooled_width = pooled_size; 298 | 299 | auto out = at::empty({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); 300 | long out_size = num_bbox * output_dim * pooled_height * pooled_width; 301 | auto top_count = at::zeros({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); 302 | 303 | const int num_classes = no_trans ? 1 : channels_trans / 2; 304 | const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; 305 | 306 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 307 | 308 | if (out.numel() == 0) 309 | { 310 | THCudaCheck(cudaGetLastError()); 311 | return std::make_tuple(out, top_count); 312 | } 313 | 314 | dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); 315 | dim3 block(512); 316 | 317 | AT_DISPATCH_FLOATING_TYPES(input.type(), "dcn_v2_psroi_pooling_cuda_forward", [&] { 318 | DeformablePSROIPoolForwardKernelCuda<<>>( 319 | out_size, 320 | input.contiguous().data(), 321 | spatial_scale, 322 | channels, 323 | height, width, 324 | pooled_height, 325 | pooled_width, 326 | bbox.contiguous().data(), 327 | trans.contiguous().data(), 328 | no_trans, 329 | trans_std, 330 | sample_per_part, 331 | output_dim, 332 | group_size, 333 | part_size, 334 | num_classes, 335 | channels_each_class, 336 | out.data(), 337 | top_count.data()); 338 | }); 339 | THCudaCheck(cudaGetLastError()); 340 | return std::make_tuple(out, top_count); 341 | } 342 | 343 | std::tuple 344 | dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad, 345 | const at::Tensor &input, 346 | const at::Tensor &bbox, 347 | const at::Tensor &trans, 348 | const at::Tensor &top_count, 349 | const int no_trans, 350 | const float spatial_scale, 351 | const int output_dim, 352 | const int group_size, 353 | const int pooled_size, 354 | const int part_size, 355 | const int sample_per_part, 356 | const float trans_std) 357 | { 358 | AT_ASSERTM(out_grad.type().is_cuda(), "out_grad must be a CUDA tensor"); 359 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 360 | AT_ASSERTM(bbox.type().is_cuda(), "bbox must be a CUDA tensor"); 361 | AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); 362 | AT_ASSERTM(top_count.type().is_cuda(), "top_count must be a CUDA tensor"); 363 | 364 | const int batch = input.size(0); 365 | const int channels = input.size(1); 366 | const int height = input.size(2); 367 | const int width = input.size(3); 368 | const int channels_trans = no_trans ? 2 : trans.size(1); 369 | const int num_bbox = bbox.size(0); 370 | 371 | AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); 372 | auto pooled_height = pooled_size; 373 | auto pooled_width = pooled_size; 374 | long out_size = num_bbox * output_dim * pooled_height * pooled_width; 375 | const int num_classes = no_trans ? 1 : channels_trans / 2; 376 | const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; 377 | 378 | auto input_grad = at::zeros({batch, channels, height, width}, out_grad.options()); 379 | auto trans_grad = at::zeros_like(trans); 380 | 381 | if (input_grad.numel() == 0) 382 | { 383 | THCudaCheck(cudaGetLastError()); 384 | return std::make_tuple(input_grad, trans_grad); 385 | } 386 | 387 | dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); 388 | dim3 block(512); 389 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 390 | 391 | AT_DISPATCH_FLOATING_TYPES(out_grad.type(), "dcn_v2_psroi_pooling_cuda_backward", [&] { 392 | DeformablePSROIPoolBackwardAccKernelCuda<<>>( 393 | out_size, 394 | out_grad.contiguous().data(), 395 | top_count.contiguous().data(), 396 | num_bbox, 397 | spatial_scale, 398 | channels, 399 | height, 400 | width, 401 | pooled_height, 402 | pooled_width, 403 | output_dim, 404 | input_grad.contiguous().data(), 405 | trans_grad.contiguous().data(), 406 | input.contiguous().data(), 407 | bbox.contiguous().data(), 408 | trans.contiguous().data(), 409 | no_trans, 410 | trans_std, 411 | sample_per_part, 412 | group_size, 413 | part_size, 414 | num_classes, 415 | channels_each_class); 416 | }); 417 | THCudaCheck(cudaGetLastError()); 418 | return std::make_tuple(input_grad, trans_grad); 419 | } -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/src/cuda/vision.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | at::Tensor 5 | dcn_v2_cuda_forward(const at::Tensor &input, 6 | const at::Tensor &weight, 7 | const at::Tensor &bias, 8 | const at::Tensor &offset, 9 | const at::Tensor &mask, 10 | const int kernel_h, 11 | const int kernel_w, 12 | const int stride_h, 13 | const int stride_w, 14 | const int pad_h, 15 | const int pad_w, 16 | const int dilation_h, 17 | const int dilation_w, 18 | const int deformable_group); 19 | 20 | std::vector 21 | dcn_v2_cuda_backward(const at::Tensor &input, 22 | const at::Tensor &weight, 23 | const at::Tensor &bias, 24 | const at::Tensor &offset, 25 | const at::Tensor &mask, 26 | const at::Tensor &grad_output, 27 | int kernel_h, int kernel_w, 28 | int stride_h, int stride_w, 29 | int pad_h, int pad_w, 30 | int dilation_h, int dilation_w, 31 | int deformable_group); 32 | 33 | 34 | std::tuple 35 | dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input, 36 | const at::Tensor &bbox, 37 | const at::Tensor &trans, 38 | const int no_trans, 39 | const float spatial_scale, 40 | const int output_dim, 41 | const int group_size, 42 | const int pooled_size, 43 | const int part_size, 44 | const int sample_per_part, 45 | const float trans_std); 46 | 47 | std::tuple 48 | dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad, 49 | const at::Tensor &input, 50 | const at::Tensor &bbox, 51 | const at::Tensor &trans, 52 | const at::Tensor &top_count, 53 | const int no_trans, 54 | const float spatial_scale, 55 | const int output_dim, 56 | const int group_size, 57 | const int pooled_size, 58 | const int part_size, 59 | const int sample_per_part, 60 | const float trans_std); -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/src/dcn_v2.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | at::Tensor 10 | dcn_v2_forward(const at::Tensor &input, 11 | const at::Tensor &weight, 12 | const 
at::Tensor &bias, 13 | const at::Tensor &offset, 14 | const at::Tensor &mask, 15 | const int kernel_h, 16 | const int kernel_w, 17 | const int stride_h, 18 | const int stride_w, 19 | const int pad_h, 20 | const int pad_w, 21 | const int dilation_h, 22 | const int dilation_w, 23 | const int deformable_group) 24 | { 25 | if (input.type().is_cuda()) 26 | { 27 | #ifdef WITH_CUDA 28 | return dcn_v2_cuda_forward(input, weight, bias, offset, mask, 29 | kernel_h, kernel_w, 30 | stride_h, stride_w, 31 | pad_h, pad_w, 32 | dilation_h, dilation_w, 33 | deformable_group); 34 | #else 35 | AT_ERROR("Not compiled with GPU support"); 36 | #endif 37 | } 38 | else{ 39 | return dcn_v2_cpu_forward(input, weight, bias, offset, mask, 40 | kernel_h, kernel_w, 41 | stride_h, stride_w, 42 | pad_h, pad_w, 43 | dilation_h, dilation_w, 44 | deformable_group); 45 | } 46 | } 47 | 48 | std::vector 49 | dcn_v2_backward(const at::Tensor &input, 50 | const at::Tensor &weight, 51 | const at::Tensor &bias, 52 | const at::Tensor &offset, 53 | const at::Tensor &mask, 54 | const at::Tensor &grad_output, 55 | int kernel_h, int kernel_w, 56 | int stride_h, int stride_w, 57 | int pad_h, int pad_w, 58 | int dilation_h, int dilation_w, 59 | int deformable_group) 60 | { 61 | if (input.type().is_cuda()) 62 | { 63 | #ifdef WITH_CUDA 64 | return dcn_v2_cuda_backward(input, 65 | weight, 66 | bias, 67 | offset, 68 | mask, 69 | grad_output, 70 | kernel_h, kernel_w, 71 | stride_h, stride_w, 72 | pad_h, pad_w, 73 | dilation_h, dilation_w, 74 | deformable_group); 75 | #else 76 | AT_ERROR("Not compiled with GPU support"); 77 | #endif 78 | } 79 | else{ 80 | return dcn_v2_cpu_backward(input, 81 | weight, 82 | bias, 83 | offset, 84 | mask, 85 | grad_output, 86 | kernel_h, kernel_w, 87 | stride_h, stride_w, 88 | pad_h, pad_w, 89 | dilation_h, dilation_w, 90 | deformable_group); 91 | } 92 | } 93 | 94 | std::tuple 95 | dcn_v2_psroi_pooling_forward(const at::Tensor &input, 96 | const at::Tensor &bbox, 97 | const at::Tensor &trans, 98 | const int no_trans, 99 | const float spatial_scale, 100 | const int output_dim, 101 | const int group_size, 102 | const int pooled_size, 103 | const int part_size, 104 | const int sample_per_part, 105 | const float trans_std) 106 | { 107 | if (input.type().is_cuda()) 108 | { 109 | #ifdef WITH_CUDA 110 | return dcn_v2_psroi_pooling_cuda_forward(input, 111 | bbox, 112 | trans, 113 | no_trans, 114 | spatial_scale, 115 | output_dim, 116 | group_size, 117 | pooled_size, 118 | part_size, 119 | sample_per_part, 120 | trans_std); 121 | #else 122 | AT_ERROR("Not compiled with GPU support"); 123 | #endif 124 | } 125 | else{ 126 | return dcn_v2_psroi_pooling_cpu_forward(input, 127 | bbox, 128 | trans, 129 | no_trans, 130 | spatial_scale, 131 | output_dim, 132 | group_size, 133 | pooled_size, 134 | part_size, 135 | sample_per_part, 136 | trans_std); 137 | } 138 | } 139 | 140 | std::tuple 141 | dcn_v2_psroi_pooling_backward(const at::Tensor &out_grad, 142 | const at::Tensor &input, 143 | const at::Tensor &bbox, 144 | const at::Tensor &trans, 145 | const at::Tensor &top_count, 146 | const int no_trans, 147 | const float spatial_scale, 148 | const int output_dim, 149 | const int group_size, 150 | const int pooled_size, 151 | const int part_size, 152 | const int sample_per_part, 153 | const float trans_std) 154 | { 155 | if (input.type().is_cuda()) 156 | { 157 | #ifdef WITH_CUDA 158 | return dcn_v2_psroi_pooling_cuda_backward(out_grad, 159 | input, 160 | bbox, 161 | trans, 162 | top_count, 163 | no_trans, 164 | spatial_scale, 165 | 
output_dim, 166 | group_size, 167 | pooled_size, 168 | part_size, 169 | sample_per_part, 170 | trans_std); 171 | #else 172 | AT_ERROR("Not compiled with GPU support"); 173 | #endif 174 | } 175 | else{ 176 | return dcn_v2_psroi_pooling_cpu_backward(out_grad, 177 | input, 178 | bbox, 179 | trans, 180 | top_count, 181 | no_trans, 182 | spatial_scale, 183 | output_dim, 184 | group_size, 185 | pooled_size, 186 | part_size, 187 | sample_per_part, 188 | trans_std); 189 | } 190 | } -------------------------------------------------------------------------------- /mmdet/layers/DCNv2/src/vision.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dcn_v2.h" 3 | 4 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 5 | m.def("dcn_v2_forward", &dcn_v2_forward, "dcn_v2_forward"); 6 | m.def("dcn_v2_backward", &dcn_v2_backward, "dcn_v2_backward"); 7 | m.def("dcn_v2_psroi_pooling_forward", &dcn_v2_psroi_pooling_forward, "dcn_v2_psroi_pooling_forward"); 8 | m.def("dcn_v2_psroi_pooling_backward", &dcn_v2_psroi_pooling_backward, "dcn_v2_psroi_pooling_backward"); 9 | } 10 | -------------------------------------------------------------------------------- /mmdet/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lbin/Retinaface_Detectron2/579e500b35efac6afc389dfc9bbea0b129e91ba6/mmdet/layers/__init__.py -------------------------------------------------------------------------------- /mmdet/layers/nms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchvision.ops import boxes as box_ops 3 | from torchvision.ops import nms # BC-compat 4 | 5 | from apex.amp import float_function 6 | 7 | 8 | @float_function 9 | def batched_nms(boxes, scores, idxs, iou_threshold): 10 | """ 11 | Same as torchvision.ops.boxes.batched_nms, but safer. 12 | """ 13 | assert boxes.shape[-1] == 4 14 | # TODO may need better strategy. 15 | # Investigate after having a fully-cuda NMS op. 
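# For moderate box counts, the code below defers to torchvision's batched_nms, which
# offsets boxes by category index so a single NMS call covers all classes; for very
# large inputs it instead runs NMS independently per category id, merges the kept
# indices, and returns them sorted by descending score.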
16 | if len(boxes) < 40000: 17 | return box_ops.batched_nms(boxes, scores, idxs, iou_threshold) 18 | 19 | result_mask = scores.new_zeros(scores.size(), dtype=torch.bool) 20 | for id in torch.unique(idxs).cpu().tolist(): 21 | mask = (idxs == id).nonzero().view(-1) 22 | keep = nms(boxes[mask], scores[mask], iou_threshold) 23 | result_mask[mask[keep]] = True 24 | keep = result_mask.nonzero().view(-1) 25 | keep = keep[scores[keep].argsort(descending=True)] 26 | return keep 27 | -------------------------------------------------------------------------------- /mmdet/layers/ssh.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from detectron2.layers import Conv2d, get_norm 5 | 6 | from mmdet.layers.DCNv2.dcn_v2 import DCN 7 | 8 | # from detectron2.layers.batch_norm import NaiveSyncBatchNorm 9 | 10 | 11 | def conv_bn(in_channel, out_channel, stride=1, leaky=0, norm="BN"): 12 | return nn.Sequential( 13 | Conv2d( 14 | in_channel, 15 | out_channel, 16 | kernel_size=3, 17 | stride=stride, 18 | padding=1, 19 | bias=False, 20 | norm=get_norm(norm, out_channel), 21 | ), 22 | nn.LeakyReLU(negative_slope=leaky, inplace=True), 23 | ) 24 | 25 | 26 | def conv_bn_no_relu(in_channel, out_channel, stride, norm="BN"): 27 | return Conv2d( 28 | in_channel, 29 | out_channel, 30 | kernel_size=3, 31 | stride=stride, 32 | padding=1, 33 | bias=False, 34 | norm=get_norm(norm, out_channel), 35 | ) 36 | 37 | 38 | def conv_bn1X1(in_channel, out_channel, stride, leaky=0, norm="BN"): 39 | return nn.Sequential( 40 | Conv2d( 41 | in_channel, 42 | out_channel, 43 | kernel_size=1, 44 | stride=stride, 45 | padding=0, 46 | bias=False, 47 | norm=get_norm(norm, out_channel), 48 | ), 49 | nn.LeakyReLU(negative_slope=leaky, inplace=True), 50 | ) 51 | 52 | 53 | def conv_dw(in_channel, out_channel, stride, leaky=0.1, norm="BN"): 54 | return nn.Sequential( 55 | Conv2d( 56 | in_channel, 57 | out_channel, 58 | kernel_size=3, 59 | stride=stride, 60 | padding=1, 61 | bias=False, 62 | groups=in_channel, 63 | norm=get_norm(norm, out_channel), 64 | ), 65 | nn.LeakyReLU(negative_slope=leaky, inplace=True), 66 | Conv2d( 67 | in_channel, 68 | out_channel, 69 | kernel_size=1, 70 | stride=1, 71 | padding=0, 72 | bias=False, 73 | norm=get_norm(norm, out_channel), 74 | ), 75 | nn.LeakyReLU(negative_slope=leaky, inplace=True), 76 | ) 77 | 78 | 79 | class DeformConv(nn.Module): 80 | def __init__(self, in_channel, out_channel, norm="BN"): 81 | super(DeformConv, self).__init__() 82 | self.actf = nn.Sequential(get_norm(norm, out_channel), nn.ReLU(inplace=True)) 83 | self.conv = DCN( 84 | in_channel, 85 | out_channel, 86 | kernel_size=(3, 3), 87 | stride=1, 88 | padding=1, 89 | dilation=1, 90 | deformable_groups=1, 91 | ) 92 | 93 | def forward(self, x): 94 | x = self.conv(x) 95 | x = self.actf(x) 96 | return x 97 | 98 | 99 | # SSH: Single Stage Headless Face Detector 100 | class SSH(nn.Module): 101 | def __init__(self, cfg, in_channel, out_channel): 102 | super(SSH, self).__init__() 103 | assert out_channel % 4 == 0 104 | self.use_dcnv2 = cfg.MODEL.RETINANET.WITH_DCNv2 105 | self.norm = cfg.MODEL.RETINANET.NORM 106 | leaky = 0 107 | if out_channel <= 64: 108 | leaky = 0.1 109 | self.conv_1 = conv_bn_no_relu(in_channel, out_channel // 2, stride=1, norm=self.norm) 110 | 111 | self.conv_2 = conv_bn(in_channel, out_channel // 4, stride=1, leaky=leaky, norm=self.norm) 112 | self.conv_3 = conv_bn_no_relu(out_channel // 4, out_channel // 4,
stride=1, norm=self.norm) 113 | 114 | self.conv_4 = conv_bn( 115 | out_channel // 4, out_channel // 4, stride=1, leaky=leaky, norm=self.norm 116 | ) 117 | self.conv_5 = conv_bn_no_relu(out_channel // 4, out_channel // 4, stride=1, norm=self.norm) 118 | if self.use_dcnv2: 119 | self.dcn = DeformConv(in_channel, out_channel, norm=self.norm) 120 | 121 | def forward(self, input): 122 | conv_1 = self.conv_1(input) 123 | 124 | conv_2 = self.conv_2(input) 125 | conv_2_3 = self.conv_3(conv_2) 126 | 127 | conv_4 = self.conv_4(conv_2) 128 | conv_4_5 = self.conv_5(conv_4) 129 | 130 | out = torch.cat([conv_1, conv_2_3, conv_4_5], dim=1) 131 | out = F.relu(out) 132 | if self.use_dcnv2: 133 | out = self.dcn(out) 134 | return out 135 | -------------------------------------------------------------------------------- /mmdet/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone import * 2 | from .meta_arch import RetinaFace 3 | -------------------------------------------------------------------------------- /mmdet/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .torch_resnet import build_torch_resnet_backbone, build_torch_resnet_fpn_backbone 2 | 3 | __all__ = [k for k in globals().keys() if not k.startswith("_")] 4 | -------------------------------------------------------------------------------- /mmdet/modeling/backbone/torch_resnet.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torchvision.models.resnet as resnet 3 | from detectron2.layers import ShapeSpec 4 | 5 | # from centernet.network.backbone import Backbone 6 | from detectron2.modeling import Backbone 7 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 8 | from detectron2.modeling.backbone.fpn import FPN, LastLevelMaxPool 9 | 10 | _resnet_mapper = {18: resnet.resnet18, 50: resnet.resnet50, 101: resnet.resnet101} 11 | 12 | 13 | class ResnetBackbone(Backbone): 14 | def __init__(self, cfg, out_features=None, pretrained=True): 15 | super().__init__() 16 | depth = cfg.MODEL.RESNETS.DEPTH 17 | backbone = _resnet_mapper[depth](pretrained=pretrained) 18 | self.stage0 = nn.Sequential(backbone.conv1, backbone.bn1, backbone.relu, backbone.maxpool) 19 | self.stage1 = backbone.layer1 20 | self.stage2 = backbone.layer2 21 | self.stage3 = backbone.layer3 22 | self.stage4 = backbone.layer4 23 | 24 | self.stages_and_names = [] 25 | 26 | self.add_module("res1", self.stage0) 27 | self.stages_and_names.append((self.stage0, "res1")) 28 | 29 | self.add_module("res2", self.stage1) 30 | self.stages_and_names.append((self.stage1, "res2")) 31 | 32 | self.add_module("res3", self.stage2) 33 | self.stages_and_names.append((self.stage2, "res3")) 34 | 35 | self.add_module("res4", self.stage3) 36 | self.stages_and_names.append((self.stage3, "res4")) 37 | 38 | self.add_module("res5", self.stage4) 39 | self.stages_and_names.append((self.stage4, "res5")) 40 | 41 | self._out_feature_strides = {} 42 | self._out_feature_channels = {} 43 | 44 | self._out_feature_strides["res3"] = 8 45 | self._out_feature_channels["res3"] = 512 46 | 47 | self._out_feature_strides["res4"] = 16 48 | self._out_feature_channels["res4"] = 1024 49 | 50 | self._out_feature_strides["res5"] = 32 51 | self._out_feature_channels["res5"] = 2048 52 | 53 | self._out_features = out_features 54 | 55 | def forward(self, x): 56 | outputs = {} 57 | for stage, name in self.stages_and_names: 58 | 
x = stage(x) 59 | if name in self._out_features: 60 | outputs[name] = x 61 | 62 | return outputs 63 | 64 | 65 | @BACKBONE_REGISTRY.register() 66 | def build_torch_resnet_backbone(cfg): 67 | out_features = cfg.MODEL.RESNETS.OUT_FEATURES 68 | return ResnetBackbone(cfg, out_features) 69 | 70 | 71 | @BACKBONE_REGISTRY.register() 72 | def build_torch_resnet_fpn_backbone(cfg, input_shape: ShapeSpec): 73 | """ 74 | Args: 75 | cfg: a detectron2 CfgNode 76 | Returns: 77 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 78 | """ 79 | bottom_up = build_torch_resnet_backbone(cfg) 80 | in_features = cfg.MODEL.FPN.IN_FEATURES 81 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 82 | backbone = FPN( 83 | bottom_up=bottom_up, 84 | in_features=in_features, 85 | out_channels=out_channels, 86 | norm=cfg.MODEL.FPN.NORM, 87 | top_block=LastLevelMaxPool(), 88 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE, 89 | ) 90 | return backbone 91 | -------------------------------------------------------------------------------- /mmdet/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | from .retinaface import RetinaFace 2 | -------------------------------------------------------------------------------- /mmdet/modeling/meta_arch/retinaface.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import List 3 | 4 | import torch 5 | from detectron2.layers import ShapeSpec, cat 6 | from detectron2.modeling.anchor_generator import build_anchor_generator 7 | from detectron2.modeling.meta_arch import RetinaNet 8 | from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY 9 | from detectron2.structures import ImageList 10 | from detectron2.utils.events import get_event_storage 11 | from fvcore.nn import sigmoid_focal_loss_jit, smooth_l1_loss 12 | from torch import nn 13 | from torch.nn import functional as F 14 | 15 | # from mmdet.layers.nms import batched_nms 16 | from mmdet.layers.ssh import SSH 17 | 18 | __all__ = ["RetinaFace"] 19 | 20 | 21 | @META_ARCH_REGISTRY.register() 22 | class RetinaFace(RetinaNet): 23 | """ 24 | Implement RetinaFace (https://arxiv.org/abs/1905.00641). 25 | """ 26 | 27 | def __init__(self, cfg): 28 | super().__init__(cfg) 29 | 30 | backbone_shape = self.backbone.output_shape() 31 | feature_shapes = [backbone_shape[f] for f in self.in_features] 32 | self.head = RetinaFaceHead(cfg, feature_shapes) 33 | self.input_format = cfg.INPUT.FORMAT 34 | self.to(self.device) 35 | 36 | def preprocess_image(self, batched_inputs): 37 | """ 38 | Normalize, pad and batch the input images. 39 | """ 40 | images = [x["image"].to(self.device) for x in batched_inputs] 41 | if self.input_format == "RGB": 42 | images = [(x / 255.0 - self.pixel_mean) / self.pixel_std for x in images] 43 | else: 44 | images = [(x - self.pixel_mean) / self.pixel_std for x in images] 45 | images = ImageList.from_tensors(images, self.backbone.size_divisibility) 46 | return images 47 | 48 | 49 | class RetinaFaceHead(nn.Module): 50 | """ 51 | The head used in RetinaFace for object classification and box regression. 52 | It has two subnets for the two tasks, with a common structure but separate parameters.
53 | """ 54 | 55 | def __init__(self, cfg, input_shape: List[ShapeSpec]): 56 | super().__init__() 57 | # fmt: off 58 | in_channels = input_shape[0].channels 59 | num_classes = cfg.MODEL.RETINANET.NUM_CLASSES 60 | num_convs = cfg.MODEL.RETINANET.NUM_CONVS 61 | prior_prob = cfg.MODEL.RETINANET.PRIOR_PROB 62 | num_anchors = build_anchor_generator(cfg, input_shape).num_cell_anchors 63 | # fmt: on 64 | assert ( 65 | len(set(num_anchors)) == 1 66 | ), "Using different number of anchors between levels is not currently supported!" 67 | num_anchors = num_anchors[0] 68 | 69 | self.ssh = [] 70 | for i in range(len(cfg.MODEL.RETINANET.IN_FEATURES)): 71 | ssh = SSH(cfg, in_channels, in_channels) 72 | name = "ssh" + str(i) 73 | self.add_module(name, ssh) 74 | self.ssh.append(ssh) 75 | 76 | cls_subnet = [] 77 | bbox_subnet = [] 78 | for _ in range(num_convs): 79 | cls_subnet.append( 80 | nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) 81 | ) 82 | cls_subnet.append(nn.ReLU()) 83 | bbox_subnet.append( 84 | nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) 85 | ) 86 | bbox_subnet.append(nn.ReLU()) 87 | 88 | self.cls_score = nn.Conv2d( 89 | in_channels, num_anchors * num_classes, kernel_size=1, stride=1, padding=0 90 | ) 91 | self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=1, stride=1, padding=0) 92 | 93 | # Initialization 94 | for modules in [self.cls_score, self.bbox_pred]: 95 | for layer in modules.modules(): 96 | if isinstance(layer, nn.Conv2d): 97 | torch.nn.init.normal_(layer.weight, mean=0, std=0.01) 98 | torch.nn.init.constant_(layer.bias, 0) 99 | 100 | # Use prior in model initialization to improve stability 101 | bias_value = -math.log((1 - prior_prob) / prior_prob) 102 | torch.nn.init.constant_(self.cls_score.bias, bias_value) 103 | 104 | def forward(self, features): 105 | """ 106 | Arguments: 107 | features (list[Tensor]): FPN feature map tensors in high to low resolution. 108 | Each tensor in the list correspond to different feature levels. 109 | Returns: 110 | logits (list[Tensor]): #lvl tensors, each has shape (N, AxK, Hi, Wi). 111 | The tensor predicts the classification probability 112 | at each spatial position for each of the A anchors and K object 113 | classes. 114 | bbox_reg (list[Tensor]): #lvl tensors, each has shape (N, Ax4, Hi, Wi). 115 | The tensor predicts 4-vector (dx,dy,dw,dh) box 116 | regression values for every anchor. These values are the 117 | relative offset between the anchor and the ground truth box. 
118 | """ 119 | logits = [] 120 | bbox_reg = [] 121 | for i, feature in enumerate(features): 122 | feature = self.ssh[i](feature) 123 | logits.append(self.cls_score(feature)) 124 | bbox_reg.append(self.bbox_pred(feature)) 125 | return logits, bbox_reg 126 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import glob 4 | import os 5 | from distutils.core import Extension, setup 6 | 7 | import numpy 8 | import torch 9 | from Cython.Build import cythonize 10 | from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension 11 | 12 | requirements = ["torch", "torchvision"] 13 | 14 | torch_ver = [int(x) for x in torch.__version__.split(".")[:2]] 15 | assert torch_ver >= [1, 4], "Requires PyTorch >= 1.4" 16 | 17 | bbox_oevelaps_extensions = [ 18 | Extension( 19 | "mmdet.evaluation.bbox", 20 | ["mmdet/evaluation/box_overlaps.pyx"], 21 | include_dirs=[numpy.get_include()], 22 | ) 23 | ] 24 | 25 | 26 | def get_dcnv2_extensions(): 27 | this_dir = os.path.dirname(os.path.abspath(__file__)) 28 | extensions_dir = os.path.join(this_dir, "mmdet/layers/DCNv2/src") 29 | 30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 33 | 34 | os.environ["CC"] = "g++" 35 | sources = main_file + source_cpu 36 | extension = CppExtension 37 | extra_compile_args = {"cxx": []} 38 | define_macros = [] 39 | 40 | if torch.cuda.is_available() and CUDA_HOME is not None: 41 | extension = CUDAExtension 42 | sources += source_cuda 43 | define_macros += [("WITH_CUDA", None)] 44 | extra_compile_args["nvcc"] = [ 45 | "-DCUDA_HAS_FP16=1", 46 | "-D__CUDA_NO_HALF_OPERATORS__", 47 | "-D__CUDA_NO_HALF_CONVERSIONS__", 48 | "-D__CUDA_NO_HALF2_OPERATORS__", 49 | ] 50 | else: 51 | # raise NotImplementedError('Cuda is not available') 52 | pass 53 | 54 | sources = [os.path.join(extensions_dir, s) for s in sources] 55 | include_dirs = [extensions_dir] 56 | ext_modules = [ 57 | extension( 58 | "_ext", 59 | sources, 60 | include_dirs=include_dirs, 61 | define_macros=define_macros, 62 | extra_compile_args=extra_compile_args, 63 | ) 64 | ] 65 | return ext_modules 66 | 67 | 68 | setup( 69 | name="mmdet", 70 | version="0.1.0", 71 | author="lbin", 72 | url="https://github.com/lbin/Retinaface_Mobilenet_Pytorch", 73 | description="mmdet", 74 | # packages=find_packages(exclude=("configs", "tests")), 75 | python_requires=">=3.6", 76 | install_requires=[ 77 | "termcolor>=1.1", 78 | "Pillow", # you can also use pillow-simd for better performance 79 | "yacs>=0.1.6", 80 | "tabulate", 81 | "cloudpickle", 82 | "matplotlib", 83 | "tqdm>4.29.0", 84 | "tensorboard", 85 | "fvcore", 86 | "future", # used by caffe2 87 | "pydot", # used to save caffe2 SVGs 88 | ], 89 | extras_require={ 90 | "all": ["shapely", "psutil"], 91 | "dev": ["flake8", "isort", "black==19.3b0", "flake8-bugbear", "flake8-comprehensions"], 92 | }, 93 | ext_modules=[cythonize(bbox_oevelaps_extensions)[0], get_dcnv2_extensions()[0]], 94 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 95 | ) 96 | -------------------------------------------------------------------------------- /train_net.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from detectron2.checkpoint import DetectionCheckpointer 4 | 5 
--------------------------------------------------------------------------------
/train_net.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | from detectron2.checkpoint import DetectionCheckpointer
4 | 
5 | # from detectron2.config import get_cfg
6 | from detectron2.data import build_detection_test_loader, build_detection_train_loader
7 | from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch
8 | 
9 | from mmdet.config import get_cfg
10 | from mmdet.data.datasets.widerface import register_widerface
11 | from mmdet.data.widerface_dataset_mapper import WiderFace_DatasetMapper
12 | 
13 | # from mmdet.engine.apex_trainer import ApexTrainer
14 | from mmdet.evaluation.evaluator import WiderFaceEvaluator
15 | 
16 | 
17 | class Trainer(DefaultTrainer):
18 |     @classmethod
19 |     def build_evaluator(cls, cfg, dataset_name, output_folder=None):
20 |         if output_folder is None:
21 |             output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
22 | 
23 |         return WiderFaceEvaluator(dataset_name, output_folder)
24 | 
25 |     @classmethod
26 |     def build_train_loader(cls, cfg):
27 |         return build_detection_train_loader(cfg, mapper=WiderFace_DatasetMapper(cfg, True))
28 | 
29 |     @classmethod
30 |     def build_test_loader(cls, cfg, dataset_name):
31 |         return build_detection_test_loader(
32 |             cfg, dataset_name, mapper=WiderFace_DatasetMapper(cfg, False)
33 |         )
34 | 
35 | 
36 | def setup(args):
37 |     """
38 |     Create configs and perform basic setups.
39 |     """
40 |     cfg = get_cfg()
41 |     cfg.merge_from_file(args.config_file)
42 |     cfg.merge_from_list(args.opts)
43 |     cfg.freeze()
44 |     default_setup(cfg, args)
45 |     return cfg
46 | 
47 | 
48 | def main(args):
49 |     cfg = setup(args)
50 |     register_widerface()
51 | 
52 |     if args.eval_only:
53 |         model = Trainer.build_model(cfg)
54 |         DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
55 |             cfg.MODEL.WEIGHTS, resume=args.resume
56 |         )
57 |         res = Trainer.test(cfg, model)
58 |         return res
59 | 
60 |     trainer = Trainer(cfg)
61 |     trainer.resume_or_load(resume=args.resume)
62 |     return trainer.train()
63 | 
64 | 
65 | if __name__ == "__main__":
66 |     args = default_argument_parser().parse_args()
67 |     print("Command Line Args:", args)
68 |     launch(
69 |         main,
70 |         args.num_gpus,
71 |         num_machines=args.num_machines,
72 |         machine_rank=args.machine_rank,
73 |         dist_url=args.dist_url,
74 |         args=(args,),
75 |     )
76 | 
--------------------------------------------------------------------------------
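For reference, training would typically be launched with something like `python train_net.py --config-file configs/facetron/retinaface_r_50_3x.yaml --num-gpus 4`, where the flags come from detectron2's `default_argument_parser`. The `--eval-only` branch of `main()` can also be reproduced programmatically; a minimal sketch, assuming a finished checkpoint at a hypothetical `output/model_final.pth` and skipping `default_setup` logging:

```python
# Hypothetical evaluation sketch -- mirrors the --eval-only path in main() above.
from detectron2.checkpoint import DetectionCheckpointer

from mmdet.config import get_cfg
from mmdet.data.datasets.widerface import register_widerface
from train_net import Trainer  # the Trainer subclass defined in this file

cfg = get_cfg()
cfg.merge_from_file("configs/facetron/retinaface_r_50_3x.yaml")
cfg.MODEL.WEIGHTS = "output/model_final.pth"  # hypothetical checkpoint path
cfg.freeze()

register_widerface()  # makes the WIDER FACE splits visible to the data loaders
model = Trainer.build_model(cfg)
DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(cfg.MODEL.WEIGHTS, resume=False)
print(Trainer.test(cfg, model))  # runs WiderFaceEvaluator on the registered eval split
```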